Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Scraper status

  • Loading branch information...
commit 0b57d8dce97b4973048dbd91ef74db691c0d63ba 1 parent 664e76d
Abe Stanway authored
View
2  .gitignore
@@ -1,5 +1,7 @@
opendb.php
key.p12
+dbauth.py
+dbauth.pyc
# OS X Specific
.DS_Store
View
BIN  apiclient/__init__.pyc
Binary file not shown
View
BIN  apiclient/discovery.pyc
Binary file not shown
View
BIN  apiclient/errors.pyc
Binary file not shown
View
BIN  apiclient/http.pyc
Binary file not shown
View
BIN  apiclient/mimeparse.pyc
Binary file not shown
View
BIN  apiclient/model.pyc
Binary file not shown
View
BIN  apiclient/schema.pyc
Binary file not shown
View
BIN  oauth2client/__init__.pyc
Binary file not shown
View
BIN  oauth2client/anyjson.pyc
Binary file not shown
View
11 oauth2client/client.py
@@ -34,13 +34,10 @@
from anyjson import simplejson
HAS_OPENSSL = False
-try:
- from oauth2client.crypt import Signer
- from oauth2client.crypt import make_signed_jwt
- from oauth2client.crypt import verify_signed_jwt_with_certs
- HAS_OPENSSL = True
-except ImportError:
- pass
+from oauth2client.crypt import Signer
+from oauth2client.crypt import make_signed_jwt
+from oauth2client.crypt import verify_signed_jwt_with_certs
+HAS_OPENSSL = True
try:
from urlparse import parse_qsl
View
BIN  oauth2client/client.pyc
Binary file not shown
View
BIN  oauth2client/clientsecrets.pyc
Binary file not shown
View
BIN  oauth2client/crypt.pyc
Binary file not shown
View
20 scrape.py
@@ -1,16 +1,20 @@
import httplib2
import pprint
import sys
-from BeautifulSoup import BeautifulSoup
-import requests
+from bs4 import BeautifulSoup
+import urllib2
import simplejson
+import MySQLdb
+import dbauth
+cursor = dbauth.db.cursor()
from apiclient.discovery import build
from oauth2client.client import SignedJwtAssertionCredentials
def find_avatar(username):
- r = requests.get('https://github.com/' + username)
- soup = BeautifulSoup(r.text)
+ r = urllib2.urlopen('https://github.com/' + username)
+ body = r.read()
+ soup = BeautifulSoup(body)
for img in soup.find_all("img"):
try:
if img.get('src').index('gravatar') > 0:
@@ -24,8 +28,14 @@ def printTableData(data, startIndex):
for cell in row['f']:
rowVal.append(cell['v'])
avatar = find_avatar(row['f'][3]['v'])
- print avatar
+ userurl = "https://github.com/" + row['f'][3]['v']
print 'Row %d: %s' % (startIndex, rowVal)
+ query = "INSERT INTO new_commits VALUES ('', \'%s\', \'%s\', \'%s\', \'%s\', \'%s\', \'%s\', '')" % (row['f'][3]['v'], row['f'][2]['v'], avatar, row['f'][1]['v'], userurl, row['f'][0]['v'])
+ print query
+ try:
+ cursor.execute(query)
+ except:
+ pass
startIndex +=1
View
BIN  uritemplate/__init__.pyc
Binary file not shown
View
72 view-logs.php
@@ -1,72 +0,0 @@
-<?php
- require("opendb.php");
-
- $query = mysql_query("SELECT * FROM users ORDER BY id");
- $usernames = array();
-
- while($row = mysql_fetch_array($query)){
- array_push($usernames, $row[1]);
- }
-
- mysql_close($connection);
-
- $repos = array('ios', 'php', 'java', 'ruby', 'python', 'c', 'scala', 'javascript');
-
- foreach($repos as $repo){
- $users = "https://github.com/api/v2/json/repos/search/" . $repo;
- $users = json_decode(file_get_contents($users));
- $rBROsitories = $users->repositories;
- foreach($rBROsitories as $key=>$reBRO){
- $username = $reBRO->username;
- array_push($usernames, $username);
- }
- }
-
- //an attempt to get all the data over time despite github's rate limiting.
- shuffle($usernames);
-
- foreach($usernames as $username){
- process($username, $db_ip, $db_user, $db_pass, $db_name);
- }
-
-function process($username, $db_ip, $db_user, $db_pass, $db_name){
- $repo_url = "https://api.github.com/users/" . $username . "/repos";
- $repos = json_decode(file_get_contents($repo_url));
- $messages = array();
- foreach($repos as $key => $repo) {
- $name = $repo->name;
- $commit_url = "https://api.github.com/repos/" . $username . "/" . $name . "/commits";
- $commits = json_decode(file_get_contents($commit_url));
- foreach($commits as $key => $commit) {
- try{
- $message = $commit->commit->message;
- } catch (Exception $e){
- continue;
- }
- $profanity_url = "http://www.wdyl.com/profanity?q=" . urlencode($message);
- $profanity = json_decode(file_get_contents($profanity_url));
-
- if($profanity->response == 'true') {
- $login = $commit->committer->login;
- $avatar = $commit->committer->avatar_url;
- $userurl = $commit->committer->url;
- $userurl = str_replace('api.','', $userurl);
- $userurl = str_replace('users/', '', $userurl);
- $date = $commit->commit->committer->date;
- $commiturl = $commit->commit->url;
- $commiturl = str_replace('api.','',$commiturl);
- $commiturl = str_replace('repos/','',$commiturl);
- $commiturl = str_replace('git/commits','commit',$commiturl);
-
- //hook it up
- $connection = mysql_connect($db_ip, $db_user, $db_pass);
- mysql_select_db($db_name) or die ('Unable to select database!');
- $message = filter_that_shit($message);
- $insert = "INSERT INTO new_commits VALUES ('', '$login', '$message', '$avatar', '$commiturl', '$userurl', '$date', now())";
- $insert = mysql_query($insert);
- mysql_close($connection);
- }
- }
- }
-}
-?>
Please sign in to comment.
Something went wrong with that request. Please try again.