Permalink
Please sign in to comment.
Showing
with
1,433 additions
and 0 deletions.
- +141 −0 .gitignore
- +20 −0 LICENSE
- +102 −0 README.md
- +7 −0 bootstrap.sh
- +28 −0 config.json
- +3 −0 proxy.json
- +8 −0 scripts/apikeys_sample.json
- +4 −0 scripts/crawl_user_networks.json
- +124 −0 scripts/crawl_user_networks.py
- +3 −0 scripts/crawl_user_timelines.json
- +67 −0 scripts/crawl_user_timelines.py
- +1 −0 scripts/keywords.json
- +61 −0 scripts/progress.pickle
- +63 −0 scripts/track_keywords.py
- +4 −0 scripts/twitter_crawler.json
- +167 −0 scripts/twitter_crawler.py
- +67 −0 tests/bootstrap_test.py
- +28 −0 tests/config.json
- +3 −0 tests/proxy.json
- 0 tweetf0rm/__init__.py
- +33 −0 tweetf0rm/bootstrap.py
- +97 −0 tweetf0rm/handler/file_handler.py
- +97 −0 tweetf0rm/handler/mongodb_handler.py
- 0 tweetf0rm/process/__init__.py
- +17 −0 tweetf0rm/process/worker_process.py
- +62 −0 tweetf0rm/twitterapi/stream.py
- +191 −0 tweetf0rm/twitterapi/user.py
- +35 −0 tweetf0rm/utils.py
141
.gitignore
| @@ -0,0 +1,141 @@ | |||
| +# Created by http://gitignore.io | |||
| + | |||
| +### Python ### | |||
| +*.py[cod] | |||
| + | |||
| +# C extensions | |||
| +*.so | |||
| + | |||
| +# Packages | |||
| +*.egg | |||
| +*.egg-info | |||
| +dist | |||
| +build | |||
| +eggs | |||
| +parts | |||
| +bin | |||
| +var | |||
| +sdist | |||
| +develop-eggs | |||
| +.installed.cfg | |||
| +lib | |||
| +lib64 | |||
| +__pycache__ | |||
| + | |||
| +# Installer logs | |||
| +pip-log.txt | |||
| + | |||
| +# Unit test / coverage reports | |||
| +.coverage | |||
| +.tox | |||
| +nosetests.xml | |||
| + | |||
| +# Translations | |||
| +*.mo | |||
| + | |||
| +# Mr Developer | |||
| +.mr.developer.cfg | |||
| +.project | |||
| +.pydevproject | |||
| + | |||
| +# Rope | |||
| +.ropeproject | |||
| + | |||
| + | |||
| +### Windows ### | |||
| +# Windows image file caches | |||
| +Thumbs.db | |||
| +ehthumbs.db | |||
| + | |||
| +# Folder config file | |||
| +Desktop.ini | |||
| + | |||
| +# Recycle Bin used on file shares | |||
| +$RECYCLE.BIN/ | |||
| + | |||
| +### OSX ### | |||
| +.DS_Store | |||
| +.AppleDouble | |||
| +.LSOverride | |||
| +Icon | |||
| + | |||
| + | |||
| +# Thumbnails | |||
| +._* | |||
| + | |||
| +# Files that might appear on external disk | |||
| +.Spotlight-V100 | |||
| +.Trashes | |||
| + | |||
| +### Linux ### | |||
| +.* | |||
| +!.gitignore | |||
| +!.git* | |||
| +*~ | |||
| + | |||
| + | |||
| +### SublimeText ### | |||
| +# SublimeText project files | |||
| +*.sublime-workspace | |||
| + | |||
| +### Eclipse ### | |||
| +*.pydevproject | |||
| +.project | |||
| +.metadata | |||
| +bin/** | |||
| +tmp/** | |||
| +tmp/**/* | |||
| +*.tmp | |||
| +*.bak | |||
| +*.swp | |||
| +*~.nib | |||
| +local.properties | |||
| +.classpath | |||
| +.settings/ | |||
| +.loadpath | |||
| + | |||
| +# External tool builders | |||
| +.externalToolBuilders/ | |||
| + | |||
| +# Locally stored "Eclipse launch configurations" | |||
| +*.launch | |||
| + | |||
| +# CDT-specific | |||
| +.cproject | |||
| + | |||
| +# PDT-specific | |||
| +.buildpath | |||
| + | |||
| +### LaTeX ### | |||
| +*.acn | |||
| +*.acr | |||
| +*.alg | |||
| +*.aux | |||
| +*.bbl | |||
| +*.blg | |||
| +*.dvi | |||
| +*.fdb_latexmk | |||
| +*.glg | |||
| +*.glo | |||
| +*.gls | |||
| +*.idx | |||
| +*.ilg | |||
| +*.ind | |||
| +*.ist | |||
| +*.lof | |||
| +*.log | |||
| +*.lot | |||
| +*.maf | |||
| +*.mtc | |||
| +*.mtc0 | |||
| +*.nav | |||
| +*.nlo | |||
| +*.out | |||
| +*.pdfsync | |||
| +*.ps | |||
| +*.snm | |||
| +*.synctex.gz | |||
| +*.toc | |||
| +*.vrb | |||
| +*.xdy | |||
| +*.tdo | |||
20
LICENSE
| @@ -0,0 +1,20 @@ | |||
| +The MIT License (MIT) | |||
| + | |||
| +Copyright (c) 2013 Jiang Bian | |||
| + | |||
| +Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
| +this software and associated documentation files (the "Software"), to deal in | |||
| +the Software without restriction, including without limitation the rights to | |||
| +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | |||
| +the Software, and to permit persons to whom the Software is furnished to do so, | |||
| +subject to the following conditions: | |||
| + | |||
| +The above copyright notice and this permission notice shall be included in all | |||
| +copies or substantial portions of the Software. | |||
| + | |||
| +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | |||
| +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | |||
| +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | |||
| +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |||
| +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
102
README.md
| @@ -0,0 +1,102 @@ | |||
| +tweetf0rmer | |||
| +========= | |||
| + | |||
| +A Twitter crawler that helps you collect data from Twitter for research. Most of the heavy lifting is already done by [Twython](https://github.com/ryanmcgrath/twython). ``tweetf0rmer`` is just a collection of Python scripts that help deal with errors such as connection failures. In most use cases, it will auto-restart when an exception occurs. Moreover, when the crawler exceeds the Twitter API's [rate limit](https://dev.twitter.com/docs/rate-limiting/1.1/limits), the crawler will pause itself and auto-restart later. | |||
| + | |||
| +It's quite stable for the things that I want to do; I have been running some of the scripts for 15 days without many hiccups. | |||
| + | |||
| +One of the long-term goals is to use [boto](http://boto.readthedocs.org/en/latest/) to integrate with the Amazon EC2 cluster so that you can run multiple crawlers to work around Twitter's API rate limit. Help is welcome! | |||
| + | |||
| +Installation | |||
| +------------ | |||
| + | |||
| +None... just clone this and start using it. It's not complicated enough yet to need a setup.py. | |||
| + | |||
| + git clone git://github.com/bianjiang/tweetf0rmer.git | |||
| + cd tweetf0rmer/scripts | |||
| + | |||
| +Dependencies | |||
| +------------ | |||
| +To run this, you will need: | |||
| +- [Twython](https://github.com/ryanmcgrath/twython) | |||
| +- [futures](https://pypi.python.org/pypi/futures) if you are on Python 2.7 | |||
| + | |||
| + | |||
| +Features | |||
| +------------ | |||
| + | |||
| +##### I am developing this for my own research, but feature requests or contributions are welcome for sure... | |||
| +##### If you see a problem, put in a ticket... | |||
| + | |||
| +Currently, four different scripts are provided (to meet my own needs), and they are all under the ``scripts`` folder. | |||
| + | |||
| +- Available scripts: | |||
| + - ``track_keywords.py``: Track a list of keywords (up to 4,000, as limited by Twitter API) and stream all the Tweets that are related to these keywords; see Twitter API doc [status/filter](https://dev.twitter.com/docs/api/1.1/post/statuses/filter) | |||
| + - ``crawl_user_networks.py``: Starting from a list of ``seed`` users, this script will go out and find all their ``friends`` (or ``followers``, depending on the setting) and their friends' friends until it reaches a certain ``depth``. This is often used to create a friendship network for network analysis. | |||
| + - ``crawl_user_timelines.py``: This crawls a user's most recent tweets (up to 3,200, as limited by Twitter API). | |||
| + - ``twitter_crawler.py``: This basically combines ``crawl_user_networks.py`` and ``crawl_user_timelines.py``, so it will create the friendship network while crawling all the tweets from users in the network. | |||
| + | |||
| +##### I haven't tested Python 3 yet... | |||
| + | |||
| + | |||
| +How to use | |||
| +------------ | |||
| + | |||
| +First, you'll want to log in to the Twitter dev site and create an application at https://dev.twitter.com/apps to have access to the Twitter API! | |||
| + | |||
| +After you register, create an access token and grab your application's ``Consumer Key``, ``Consumer Secret``, ``Access token`` and ``Access token secret`` from the OAuth tool tab. Put this information into an ``apikeys.json`` file in the following format. | |||
| + | |||
| + | |||
| + { | |||
| + "i0mf0rmer" :{ | |||
| + "app_key":"CONSUMER_KEY", | |||
| + "app_secret":"CONSUMER_SECRET", | |||
| + "oauth_token":"ACCESS_TOKEN", | |||
| + "oauth_token_secret":"ACCESS_TOKEN_SECRET" | |||
| + } | |||
| + } | |||
| + | |||
| +The rest is fairly straightforward; you can run, e.g., ``python crawl_user_timelines.py --help`` to get help information about the parameters of each script. | |||
| + | |||
| + | |||
| + $python crawl_user_timelines.py --help | |||
| + usage: crawl_user_timelines.py [-h] -a APIKEYS -c CRAWLER -s SEEDS -o OUTPUT | |||
| + | |||
| + optional arguments: | |||
| + -h, --help show this help message and exit | |||
| + -a APIKEYS, --apikeys APIKEYS | |||
| + config file for twitter api key (json format) | |||
| + -c CRAWLER, --crawler CRAWLER | |||
| + the crawler identifier; you can have multiple crawler | |||
| + accounts set in the apikeys.json; pick one | |||
| + -s SEEDS, --seeds SEEDS | |||
| + the list of users you want to crawl their timelines; | |||
| + see crawl_user_timelines.json as an example | |||
| + -o OUTPUT, --output OUTPUT | |||
| + define the location of the output (each user's | |||
| + timeline will be in its own file under this output | |||
| + folder identified by the user id | |||
| + | |||
| + | |||
| +### License | |||
| +------------ | |||
| + | |||
| +The MIT License (MIT) | |||
| +Copyright (c) 2013 Jiang Bian (ji0ng.bi0n@gmail.com) | |||
| + | |||
| +Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
| +this software and associated documentation files (the "Software"), to deal in | |||
| +the Software without restriction, including without limitation the rights to | |||
| +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | |||
| +the Software, and to permit persons to whom the Software is furnished to do so, | |||
| +subject to the following conditions: | |||
| + | |||
| +The above copyright notice and this permission notice shall be included in all | |||
| +copies or substantial portions of the Software. | |||
| + | |||
| +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | |||
| +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | |||
| +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | |||
| +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |||
| +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
| @@ -0,0 +1,7 @@ | |||
| +#!/bin/bash | |||
| + | |||
| +#launchctl load ~/Library/LaunchAgents/homebrew.mxcl.redis.plist | |||
| +#launchctl load ~/Library/LaunchAgents/homebrew.mxcl.mongodb.plist | |||
| + | |||
| +mongod | |||
| +redis-server /usr/local/etc/redis.conf | |||
28
config.json
| @@ -0,0 +1,28 @@ | |||
| +{ | |||
| + "apikeys": { | |||
| + "i0mf0rmer01" :{ | |||
| + "app_key":"YZ43AjRJVEEu7vxj4k5Zg", | |||
| + "app_secret":"iJUmmTgPm4ZU7gxNCazatxx3VC5oRzUEI2BOLjgED0", | |||
| + "oauth_token":"1936968966-2avAkF0e2LClKsDf1AQS9o6x3sQkMHYg41ovRMm", | |||
| + "oauth_token_secret":"FUI816A2A1sSpL8q4Y53yqiTLHVQgcSh3WIHnNUmpA" | |||
| + }, | |||
| + "i0mf0rmer02" :{ | |||
| + "app_key":"QFzw7GJ8Oiz1ue0wUn1ZQg", | |||
| + "app_secret":"TyiAt3zuQbgqrFAdmXJl2G0zdT05TuiAhTdwzXH0", | |||
| + "oauth_token":"1946931948-P66BhnJJw3gti8v30LeSaz7yGC2mP8G0VcCj05s", | |||
| + "oauth_token_secret":"EhPhP0xXETxzuh6pS2hfupJg4nGhZZnzRWDo6X0mE" | |||
| + }, | |||
| + "i0mf0rmer03" :{ | |||
| + "app_key":"UREFgTupN8euVwMLXxRGKg", | |||
| + "app_secret":"FUxHixvkmk4Lxux64DBva1obfkAh10UuMxwXOD8", | |||
| + "oauth_token":"1948122342-uMqlGB7t8xXLcM69CrO9KWiABVdMtW8MKAKAPTv", | |||
| + "oauth_token_secret":"papLkhKwUKS2qZzIYYeKfmDNIxvX4QJHk7kJTIykPdbNx" | |||
| + } | |||
| + }, | |||
| + "redis": { | |||
| + "host": "localhost", | |||
| + "port": 6379, | |||
| + "db": 0, | |||
| + "password": "wh0tever" | |||
| + } | |||
| +} | |||
| @@ -0,0 +1,3 @@ | |||
| +{ | |||
| + "proxies":["58.20.127.100:3128", "58.20.223.230:3128", "210.22.63.90:8080", "211.167.112.14:82", "59.47.43.65:8080", "59.47.43.66:8080", "59.47.43.67:8080", "59.47.43.92:8080", "59.47.43.95:8080", "210.22.63.76:8080", "59.47.43.94:8080", "120.82.10.37:8080", "120.82.10.54:8080", "59.47.43.90:8080", "120.82.10.30:8080", "59.47.43.93:8080", "59.48.143.6:3128", "61.135.179.167:8080", "120.82.10.82:8080", "210.22.63.87:8080", "120.198.230.63:81", "210.22.63.79:8080", "218.108.232.187:80", "218.108.232.190:80", "59.47.43.88:8080", "116.231.213.245:8080", "222.87.129.29:80", "210.22.63.73:8080", "221.238.28.158:8081", "60.28.183.5:8081", "210.22.59.66:3128", "210.22.63.72:8080", "59.47.43.89:8080", "202.202.0.163:3128", "218.200.66.226:80", "218.200.66.234:80", "122.96.59.102:80", "120.82.10.25:8080", "116.255.241.111:808", "59.47.43.64:8080", "221.10.40.236:80", "221.10.40.237:80", "221.10.40.238:80", "218.104.148.59:3128", "221.10.40.232:82", "210.22.63.91:8080", "210.22.63.92:8080", "210.22.63.71:8080", "118.26.57.13:82", "59.47.43.91:8080", "221.10.102.199:82", "120.82.10.93:8080", "120.82.10.96:8080", "114.80.136.112:7780", "120.82.10.87:8080", "120.82.10.89:8080", "120.82.10.90:8080", "120.82.10.94:8080", "120.82.10.85:8080", "120.82.10.78:8080", "120.82.10.8:8080", "120.82.10.80:8080", "120.82.10.81:8080", "120.82.10.83:8080", "120.82.10.84:8080", "120.82.10.61:8080", "120.82.10.62:8080", "120.82.10.63:8080", "120.82.10.64:8080", "120.82.10.65:8080", "120.82.10.66:8080", "120.82.10.68:8080", "120.82.10.5:8080", "120.82.10.52:8080", "120.82.10.53:8080", "120.82.10.55:8080", "120.82.10.57:8080", "120.82.10.58:8080", "120.82.10.6:8080", "120.82.10.35:8080", "120.82.10.36:8080", "120.82.10.39:8080", "120.82.10.42:8080", "120.82.10.46:8080", "120.82.10.48:8080", "120.82.10.49:8080", "120.82.10.16:8080", "120.82.10.28:8080", "120.82.10.29:8080", "120.82.10.3:8080", "120.82.10.32:8080", "120.82.10.33:8080", "120.82.10.12:8080", "120.82.10.14:8080", 
"120.82.10.34:8080", "120.82.10.91:8080", "120.82.10.86:8080", "120.82.10.88:8080", "120.82.10.9:8080", "120.82.10.74:8080", "120.82.10.75:8080", "120.82.10.76:8080", "120.82.10.77:8080", "120.82.10.79:8080", "120.82.10.60:8080", "120.82.10.67:8080", "120.82.10.69:8080", "120.82.10.73:8080", "120.82.10.44:8080", "120.82.10.45:8080", "120.82.10.47:8080", "120.82.10.50:8080", "120.82.10.51:8080", "120.82.10.56:8080", "91.217.67.248:80", "120.82.10.38:8080", "120.82.10.40:8080", "120.82.10.41:8080", "120.82.10.43:8080", "122.96.59.106:81", "120.82.10.92:8080", "120.82.10.95:8080", "120.82.10.10:8080", "120.82.10.11:8080", "120.82.10.13:8080", "120.82.10.15:8080", "120.82.10.26:8080", "120.82.10.27:8080", "120.82.10.31:8080", "202.171.253.102:80", "218.207.195.206:80", "221.7.11.11:82", "218.108.232.188:80", "218.108.232.189:80", "120.82.10.4:8080", "119.233.255.24:82", "111.63.14.245:80", "115.25.216.6:80", "122.96.59.107:80", "120.202.249.230:80", "120.82.10.1:8080", "120.82.10.2:8080", "62.109.21.141:3128", "119.75.219.70:80", "213.175.174.94:80"] | |||
| +} | |||
| @@ -0,0 +1,8 @@ | |||
| +{ | |||
| + "i0mf0rmer" :{ | |||
| + "app_key":"CONSUMER_KEY", | |||
| + "app_secret":"CONSUMER_SECRET", | |||
| + "oauth_token":"ACCESS_TOKEN", | |||
| + "oauth_token_secret":"ACCESS_TOKEN_SECRET" | |||
| + } | |||
| +} | |||
| @@ -0,0 +1,4 @@ | |||
| +{ | |||
| + "seeds":["BarackObama"], | |||
| + "depth": 3 | |||
| +} | |||
Oops, something went wrong.
0 comments on commit
b8beab7