Permalink
Please sign in to comment.
Showing
with
1,433 additions
and 0 deletions.
- +141 −0 .gitignore
- +20 −0 LICENSE
- +102 −0 README.md
- +7 −0 bootstrap.sh
- +28 −0 config.json
- +3 −0 proxy.json
- +8 −0 scripts/apikeys_sample.json
- +4 −0 scripts/crawl_user_networks.json
- +124 −0 scripts/crawl_user_networks.py
- +3 −0 scripts/crawl_user_timelines.json
- +67 −0 scripts/crawl_user_timelines.py
- +1 −0 scripts/keywords.json
- +61 −0 scripts/progress.pickle
- +63 −0 scripts/track_keywords.py
- +4 −0 scripts/twitter_crawler.json
- +167 −0 scripts/twitter_crawler.py
- +67 −0 tests/bootstrap_test.py
- +28 −0 tests/config.json
- +3 −0 tests/proxy.json
- 0 tweetf0rm/__init__.py
- +33 −0 tweetf0rm/bootstrap.py
- +97 −0 tweetf0rm/handler/file_handler.py
- +97 −0 tweetf0rm/handler/mongodb_handler.py
- 0 tweetf0rm/process/__init__.py
- +17 −0 tweetf0rm/process/worker_process.py
- +62 −0 tweetf0rm/twitterapi/stream.py
- +191 −0 tweetf0rm/twitterapi/user.py
- +35 −0 tweetf0rm/utils.py
141
.gitignore
| @@ -0,0 +1,141 @@ | ||
| +# Created by http://gitignore.io | ||
| + | ||
| +### Python ### | ||
| +*.py[cod] | ||
| + | ||
| +# C extensions | ||
| +*.so | ||
| + | ||
| +# Packages | ||
| +*.egg | ||
| +*.egg-info | ||
| +dist | ||
| +build | ||
| +eggs | ||
| +parts | ||
| +bin | ||
| +var | ||
| +sdist | ||
| +develop-eggs | ||
| +.installed.cfg | ||
| +lib | ||
| +lib64 | ||
| +__pycache__ | ||
| + | ||
| +# Installer logs | ||
| +pip-log.txt | ||
| + | ||
| +# Unit test / coverage reports | ||
| +.coverage | ||
| +.tox | ||
| +nosetests.xml | ||
| + | ||
| +# Translations | ||
| +*.mo | ||
| + | ||
| +# Mr Developer | ||
| +.mr.developer.cfg | ||
| +.project | ||
| +.pydevproject | ||
| + | ||
| +# Rope | ||
| +.ropeproject | ||
| + | ||
| + | ||
| +### Windows ### | ||
| +# Windows image file caches | ||
| +Thumbs.db | ||
| +ehthumbs.db | ||
| + | ||
| +# Folder config file | ||
| +Desktop.ini | ||
| + | ||
| +# Recycle Bin used on file shares | ||
| +$RECYCLE.BIN/ | ||
| + | ||
| +### OSX ### | ||
| +.DS_Store | ||
| +.AppleDouble | ||
| +.LSOverride | ||
| +Icon | ||
| + | ||
| + | ||
| +# Thumbnails | ||
| +._* | ||
| + | ||
| +# Files that might appear on external disk | ||
| +.Spotlight-V100 | ||
| +.Trashes | ||
| + | ||
| +### Linux ### | ||
| +.* | ||
| +!.gitignore | ||
| +!.git* | ||
| +*~ | ||
| + | ||
| + | ||
| +### SublimeText ### | ||
| +# SublimeText project files | ||
| +*.sublime-workspace | ||
| + | ||
| +### Eclipse ### | ||
| +*.pydevproject | ||
| +.project | ||
| +.metadata | ||
| +bin/** | ||
| +tmp/** | ||
| +tmp/**/* | ||
| +*.tmp | ||
| +*.bak | ||
| +*.swp | ||
| +*~.nib | ||
| +local.properties | ||
| +.classpath | ||
| +.settings/ | ||
| +.loadpath | ||
| + | ||
| +# External tool builders | ||
| +.externalToolBuilders/ | ||
| + | ||
| +# Locally stored "Eclipse launch configurations" | ||
| +*.launch | ||
| + | ||
| +# CDT-specific | ||
| +.cproject | ||
| + | ||
| +# PDT-specific | ||
| +.buildpath | ||
| + | ||
| +### LaTeX ### | ||
| +*.acn | ||
| +*.acr | ||
| +*.alg | ||
| +*.aux | ||
| +*.bbl | ||
| +*.blg | ||
| +*.dvi | ||
| +*.fdb_latexmk | ||
| +*.glg | ||
| +*.glo | ||
| +*.gls | ||
| +*.idx | ||
| +*.ilg | ||
| +*.ind | ||
| +*.ist | ||
| +*.lof | ||
| +*.log | ||
| +*.lot | ||
| +*.maf | ||
| +*.mtc | ||
| +*.mtc0 | ||
| +*.nav | ||
| +*.nlo | ||
| +*.out | ||
| +*.pdfsync | ||
| +*.ps | ||
| +*.snm | ||
| +*.synctex.gz | ||
| +*.toc | ||
| +*.vrb | ||
| +*.xdy | ||
| +*.tdo |
20
LICENSE
| @@ -0,0 +1,20 @@ | ||
| +The MIT License (MIT) | ||
| + | ||
| +Copyright (c) 2013 Jiang Bian | ||
| + | ||
| +Permission is hereby granted, free of charge, to any person obtaining a copy of | ||
| +this software and associated documentation files (the "Software"), to deal in | ||
| +the Software without restriction, including without limitation the rights to | ||
| +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||
| +the Software, and to permit persons to whom the Software is furnished to do so, | ||
| +subject to the following conditions: | ||
| + | ||
| +The above copyright notice and this permission notice shall be included in all | ||
| +copies or substantial portions of the Software. | ||
| + | ||
| +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||
| +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||
| +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||
| +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
102
README.md
| @@ -0,0 +1,102 @@ | ||
| +tweetf0rmer | ||
| +========= | ||
| + | ||
| +A Twitter crawler that helps you collect data from Twitter for research. Most of the heavy lifting is already done by [Twython](https://github.com/ryanmcgrath/twython). ``tweetf0rmer`` is just a collection of Python scripts that help deal with errors such as connection failures. In most use cases, it will auto-restart when an exception occurs. Moreover, when the crawler exceeds the Twitter API's [rate limit](https://dev.twitter.com/docs/rate-limiting/1.1/limits), the crawler will pause itself and auto-restart later. | ||
| + | ||
| +It's quite stable for the things that I want to do; I have been running some of the scripts for 15 days without many hiccups. | ||
| + | ||
| +One of the long-term goals is to use [boto](http://boto.readthedocs.org/en/latest/) to integrate with the Amazon EC2 cluster so that you can run multiple crawlers to work around Twitter's API rate limit. Help is welcome! | ||
| + | ||
| +Installation | ||
| +------------ | ||
| + | ||
| +None... just clone this and start using it. It's not complicated enough yet to need a setup.py. | ||
| + | ||
| + git clone git://github.com/bianjiang/tweetf0rmer.git | ||
| + cd tweetf0rmer/scripts | ||
| + | ||
| +Dependencies | ||
| +------------ | ||
| +To run this, you will need: | ||
| +- [Twython](https://github.com/ryanmcgrath/twython) | ||
| +- [futures](https://pypi.python.org/pypi/futures) if you are on Python 2.7 | ||
| + | ||
| + | ||
| +Features | ||
| +------------ | ||
| + | ||
| +##### I am developing this for my own research, but feature requests or contributions are welcome for sure... | ||
| +##### If you see a problem, put in a ticket... | ||
| + | ||
| +Currently, four different scripts are provided (to meet my own needs), and they are all under the ``scripts`` folder. | ||
| + | ||
| +- Available scripts: | ||
| + - ``track_keywords.py``: Track a list of keywords (up to 4,000, as limited by Twitter API) and stream all the Tweets that are related to these keywords; see Twitter API doc [statuses/filter](https://dev.twitter.com/docs/api/1.1/post/statuses/filter) | ||
| + - ``crawl_user_networks.py``: Starting from a list of ``seed`` users, this script will go out and find all their ``friends`` (or ``follower``, based on the setting) and their friends' friends until it reaches a certain ``depth``. This is often used to create a friendship network for network analysis. | ||
| + - ``crawl_user_timelines.py``: This crawls a user's most recent tweets (up to 3,200, as limited by Twitter API). | ||
| + - ``twitter_crawler.py``: This basically combines ``crawl_user_networks.py`` and ``crawl_user_timelines.py``, so it will create the friendship network while crawling all the tweets from users in the network. | ||
| + | ||
| +##### I haven't tested Python 3 yet... | ||
| + | ||
| + | ||
| +How to use | ||
| +------------ | ||
| + | ||
| +First, you'll want to log in to the Twitter dev site and create an application at https://dev.twitter.com/apps to have access to the Twitter API! | ||
| + | ||
| +After you register, create an access token and grab your application's ``Consumer Key``, ``Consumer Secret``, ``Access token`` and ``Access token secret`` from the OAuth tool tab. Put this information into an ``apikeys.json`` file in the following format. | ||
| + | ||
| + | ||
| + { | ||
| + "i0mf0rmer" :{ | ||
| + "app_key":"CONSUMER_KEY", | ||
| + "app_secret":"CONSUMER_SECRET", | ||
| + "oauth_token":"ACCESS_TOKEN", | ||
| + "oauth_token_secret":"ACCESS_TOKEN_SECRET" | ||
| + } | ||
| + } | ||
| + | ||
| +The rest is fairly straightforward; you can run, e.g., ``python crawl_user_timelines.py --help`` to get help information about the parameters of each script. | ||
| + | ||
| + | ||
| + $python crawl_user_timelines.py --help | ||
| + usage: crawl_user_timelines.py [-h] -a APIKEYS -c CRAWLER -s SEEDS -o OUTPUT | ||
| + | ||
| + optional arguments: | ||
| + -h, --help show this help message and exit | ||
| + -a APIKEYS, --apikeys APIKEYS | ||
| + config file for twitter api key (json format) | ||
| + -c CRAWLER, --crawler CRAWLER | ||
| + the crawler identifier; you can have multiple crawler | ||
| + accounts set in the apikeys.json; pick one | ||
| + -s SEEDS, --seeds SEEDS | ||
| + the list of users you want to crawl their timelines; | ||
| + see crawl_user_timelines.json as an example | ||
| + -o OUTPUT, --output OUTPUT | ||
| + define the location of the output (each user's | ||
| + timeline will be in its own file under this output | ||
| + folder identified by the user id | ||
| + | ||
| + | ||
| +### License | ||
| +------------ | ||
| + | ||
| +The MIT License (MIT) | ||
| +Copyright (c) 2013 Jiang Bian (ji0ng.bi0n@gmail.com) | ||
| + | ||
| +Permission is hereby granted, free of charge, to any person obtaining a copy of | ||
| +this software and associated documentation files (the "Software"), to deal in | ||
| +the Software without restriction, including without limitation the rights to | ||
| +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||
| +the Software, and to permit persons to whom the Software is furnished to do so, | ||
| +subject to the following conditions: | ||
| + | ||
| +The above copyright notice and this permission notice shall be included in all | ||
| +copies or substantial portions of the Software. | ||
| + | ||
| +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||
| +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||
| +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||
| +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| @@ -0,0 +1,7 @@ | ||
| +#!/bin/bash | ||
| + | ||
| +#launchctl load ~/Library/LaunchAgents/homebrew.mxcl.redis.plist | ||
| +#launchctl load ~/Library/LaunchAgents/homebrew.mxcl.mongodb.plist | ||
| + | ||
| +mongod | ||
| +redis-server /usr/local/etc/redis.conf |
28
config.json
| @@ -0,0 +1,28 @@ | ||
| +{ | ||
| + "apikeys": { | ||
| + "i0mf0rmer01" :{ | ||
| + "app_key":"YZ43AjRJVEEu7vxj4k5Zg", | ||
| + "app_secret":"iJUmmTgPm4ZU7gxNCazatxx3VC5oRzUEI2BOLjgED0", | ||
| + "oauth_token":"1936968966-2avAkF0e2LClKsDf1AQS9o6x3sQkMHYg41ovRMm", | ||
| + "oauth_token_secret":"FUI816A2A1sSpL8q4Y53yqiTLHVQgcSh3WIHnNUmpA" | ||
| + }, | ||
| + "i0mf0rmer02" :{ | ||
| + "app_key":"QFzw7GJ8Oiz1ue0wUn1ZQg", | ||
| + "app_secret":"TyiAt3zuQbgqrFAdmXJl2G0zdT05TuiAhTdwzXH0", | ||
| + "oauth_token":"1946931948-P66BhnJJw3gti8v30LeSaz7yGC2mP8G0VcCj05s", | ||
| + "oauth_token_secret":"EhPhP0xXETxzuh6pS2hfupJg4nGhZZnzRWDo6X0mE" | ||
| + }, | ||
| + "i0mf0rmer03" :{ | ||
| + "app_key":"UREFgTupN8euVwMLXxRGKg", | ||
| + "app_secret":"FUxHixvkmk4Lxux64DBva1obfkAh10UuMxwXOD8", | ||
| + "oauth_token":"1948122342-uMqlGB7t8xXLcM69CrO9KWiABVdMtW8MKAKAPTv", | ||
| + "oauth_token_secret":"papLkhKwUKS2qZzIYYeKfmDNIxvX4QJHk7kJTIykPdbNx" | ||
| + } | ||
| + }, | ||
| + "redis": { | ||
| + "host": "localhost", | ||
| + "port": 6379, | ||
| + "db": 0, | ||
| + "password": "wh0tever" | ||
| + } | ||
| +} |
| @@ -0,0 +1,3 @@ | ||
| +{ | ||
| + "proxies":["58.20.127.100:3128", "58.20.223.230:3128", "210.22.63.90:8080", "211.167.112.14:82", "59.47.43.65:8080", "59.47.43.66:8080", "59.47.43.67:8080", "59.47.43.92:8080", "59.47.43.95:8080", "210.22.63.76:8080", "59.47.43.94:8080", "120.82.10.37:8080", "120.82.10.54:8080", "59.47.43.90:8080", "120.82.10.30:8080", "59.47.43.93:8080", "59.48.143.6:3128", "61.135.179.167:8080", "120.82.10.82:8080", "210.22.63.87:8080", "120.198.230.63:81", "210.22.63.79:8080", "218.108.232.187:80", "218.108.232.190:80", "59.47.43.88:8080", "116.231.213.245:8080", "222.87.129.29:80", "210.22.63.73:8080", "221.238.28.158:8081", "60.28.183.5:8081", "210.22.59.66:3128", "210.22.63.72:8080", "59.47.43.89:8080", "202.202.0.163:3128", "218.200.66.226:80", "218.200.66.234:80", "122.96.59.102:80", "120.82.10.25:8080", "116.255.241.111:808", "59.47.43.64:8080", "221.10.40.236:80", "221.10.40.237:80", "221.10.40.238:80", "218.104.148.59:3128", "221.10.40.232:82", "210.22.63.91:8080", "210.22.63.92:8080", "210.22.63.71:8080", "118.26.57.13:82", "59.47.43.91:8080", "221.10.102.199:82", "120.82.10.93:8080", "120.82.10.96:8080", "114.80.136.112:7780", "120.82.10.87:8080", "120.82.10.89:8080", "120.82.10.90:8080", "120.82.10.94:8080", "120.82.10.85:8080", "120.82.10.78:8080", "120.82.10.8:8080", "120.82.10.80:8080", "120.82.10.81:8080", "120.82.10.83:8080", "120.82.10.84:8080", "120.82.10.61:8080", "120.82.10.62:8080", "120.82.10.63:8080", "120.82.10.64:8080", "120.82.10.65:8080", "120.82.10.66:8080", "120.82.10.68:8080", "120.82.10.5:8080", "120.82.10.52:8080", "120.82.10.53:8080", "120.82.10.55:8080", "120.82.10.57:8080", "120.82.10.58:8080", "120.82.10.6:8080", "120.82.10.35:8080", "120.82.10.36:8080", "120.82.10.39:8080", "120.82.10.42:8080", "120.82.10.46:8080", "120.82.10.48:8080", "120.82.10.49:8080", "120.82.10.16:8080", "120.82.10.28:8080", "120.82.10.29:8080", "120.82.10.3:8080", "120.82.10.32:8080", "120.82.10.33:8080", "120.82.10.12:8080", "120.82.10.14:8080", 
"120.82.10.34:8080", "120.82.10.91:8080", "120.82.10.86:8080", "120.82.10.88:8080", "120.82.10.9:8080", "120.82.10.74:8080", "120.82.10.75:8080", "120.82.10.76:8080", "120.82.10.77:8080", "120.82.10.79:8080", "120.82.10.60:8080", "120.82.10.67:8080", "120.82.10.69:8080", "120.82.10.73:8080", "120.82.10.44:8080", "120.82.10.45:8080", "120.82.10.47:8080", "120.82.10.50:8080", "120.82.10.51:8080", "120.82.10.56:8080", "91.217.67.248:80", "120.82.10.38:8080", "120.82.10.40:8080", "120.82.10.41:8080", "120.82.10.43:8080", "122.96.59.106:81", "120.82.10.92:8080", "120.82.10.95:8080", "120.82.10.10:8080", "120.82.10.11:8080", "120.82.10.13:8080", "120.82.10.15:8080", "120.82.10.26:8080", "120.82.10.27:8080", "120.82.10.31:8080", "202.171.253.102:80", "218.207.195.206:80", "221.7.11.11:82", "218.108.232.188:80", "218.108.232.189:80", "120.82.10.4:8080", "119.233.255.24:82", "111.63.14.245:80", "115.25.216.6:80", "122.96.59.107:80", "120.202.249.230:80", "120.82.10.1:8080", "120.82.10.2:8080", "62.109.21.141:3128", "119.75.219.70:80", "213.175.174.94:80"] | ||
| +} |
| @@ -0,0 +1,8 @@ | ||
| +{ | ||
| + "i0mf0rmer" :{ | ||
| + "app_key":"CONSUMER_KEY", | ||
| + "app_secret":"CONSUMER_SECRET", | ||
| + "oauth_token":"ACCESS_TOKEN", | ||
| + "oauth_token_secret":"ACCESS_TOKEN_SECRET" | ||
| + } | ||
| +} |
| @@ -0,0 +1,4 @@ | ||
| +{ | ||
| + "seeds":["BarackObama"], | ||
| + "depth": 3 | ||
| +} |
Oops, something went wrong.
0 comments on commit
b8beab7