In [1]:
import os
import csv
import re
import tinys3
import json
import pandas as pd
from bson.json_util import dumps
from pymongo import MongoClient
from pprint import pprint

# Credentials

In [2]:
# The first line of the credentials file is split on ', ' into exactly three
# fields: host address, user name and password. Nothing is loaded (and the
# three names stay undefined) when the file does not exist.
pw_file = 'credentials/mongo_pw.txt'
if os.path.exists(pw_file):
    with open(pw_file, 'r') as f:
        first_line = f.readline()
    pub_ip, mongo_usr, mongo_usr_pw = first_line.strip().split(', ')

# Connect to MongoDB

In [3]:
# connect to ec2 mongo client
# (pub_ip, mongo_usr and mongo_usr_pw only exist if the credentials file was found)
client = MongoClient('{0}:27017'.format(pub_ip))

# get reference to github_db
# `db` is the module-level handle that later cells use, either as a global or as a default argument
db = client['github_db']

# authenticate user for database
# NOTE(review): Database.authenticate appears to be deprecated/removed in newer PyMongo releases — confirm the installed version
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Show MongoDB collections

In [4]:
def show_collections(col_type='all'):
    '''
    IN: col_type - 'backup' lists only collections whose name contains
        "backup", 'original' lists only those that do not, and any other
        value lists every collection
    RETURN: Printed output of collections and collection counts
            (reads the module-level `db` handle)
    '''
    line = 'Documents in \"{0}\" collection: {1}'
    for name in sorted(db.collection_names()):
        is_backup = 'backup' in name
        # skip names that fall outside the requested group
        if col_type == 'backup' and not is_backup:
            continue
        if col_type == 'original' and is_backup:
            continue
        print(line.format(name, db[name].count()))

In [5]:
# Print the document count of every collection, originals and backups alike.
show_collections('all')
# Left commented out on purpose: each line below would drop one *_backup collection.
# db.drop_collection('git_repos_contributors_backup')
# db.drop_collection('git_repos_docs_backup')
# db.drop_collection('git_repos_meta_backup')
# db.drop_collection('git_repos_subscribers_backup')

# db.drop_collection('git_users_followers_backup')
# db.drop_collection('git_users_following_backup')
# db.drop_collection('git_users_meta_backup')

Documents in "git_repos_contributors" collection: 3074
Documents in "git_repos_contributors_backup" collection: 3030
Documents in "git_repos_docs" collection: 146833
Documents in "git_repos_docs_backup" collection: 146685
Documents in "git_repos_meta" collection: 3073
Documents in "git_repos_meta_backup" collection: 3029
Documents in "git_repos_subscribers" collection: 3074
Documents in "git_repos_subscribers_backup" collection: 3030
Documents in "git_users_followers" collection: 1398
Documents in "git_users_followers_backup" collection: 1364
Documents in "git_users_following" collection: 1398
Documents in "git_users_following_backup" collection: 1364
Documents in "git_users_meta" collection: 1397
Documents in "git_users_meta_backup" collection: 1363


# Backup

In [6]:
def backup(db):
    '''
    Copy every non-backup collection into a sibling "<name>_backup" collection.

    IN: db - database handle; must provide collection_names() and item access
        returning collections with find() and insert_many()
    RETURN: None. A failure while copying one collection is reported on
            stdout and the remaining collections are still processed.
    '''
    original_names = [x for x in db.collection_names() if 'backup' not in x]

    for name in original_names:
        cursor = db[name].find()
        try:
            docs = list(cursor)
            # nothing to copy for an empty collection; skipping also avoids
            # handing insert_many an empty list
            if docs:
                db['{0}_backup'.format(name)].insert_many(docs)
        except Exception as err:
            # Still best-effort per collection, but no longer silent, and
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            print('Backup of "{0}" failed: {1}'.format(name, err))

In [7]:
# %time backup(db)

# Filter based on file extension

In [8]:
def filter_filetypes(db, col, _filter='all'):
    '''
    IN: db - database handle
        col - name of the collection to query
        _filter - regular expression matched case-insensitively against
                  'file_extension'; the default 'all' applies no filter
    RETURN: result of find() on the collection, projected to the file and
            repo fields listed below (without '_id')
    '''
    projection = {'_id': False, 'file_name':1, 'file_extension':1, 'file_contents':1, 'repo_name':1,
                  'repo_id':1, 'repo_fullname':1, 'repo_owner_login':1}

    if _filter == 'all':
        query = {}
    else:
        query = {'file_extension': re.compile(_filter, re.IGNORECASE)}

    return db[col].find(query, projection)

# Clean file contents

In [9]:
def clean_py(doc):
    '''
    Strip comments and string literals from a document's Python source.

    IN: doc - mapping holding the raw source text under 'file_contents'
    RETURN: str - the text with triple-double-quoted blocks, '#' comments and
            quoted strings removed, brackets and braces blanked out, and all
            whitespace collapsed to single spaces with no leading or
            trailing space
    '''
    string = doc['file_contents']
    # triple-quoted blocks are removed before the single-quote patterns run
    string = re.sub('""".*?"""', '', string, flags=re.DOTALL)
    # Match up to (not through) the newline so that a comment on a final line
    # with no trailing newline is removed too.
    string = re.sub('#[^\n]*', ' ', string)
    string = re.sub('".*?"', '', string, flags=re.DOTALL)
    string = re.sub("'.*?'", '', string, flags=re.DOTALL)
    # these are literal backslash-n / backslash-t character pairs, not
    # real newlines or tabs
    string = string.replace('\\n', ' ')
    string = string.replace('\\t', ' ')

    for ch in "[]{}":
        string = string.replace(ch, ' ')

    # Collapse whitespace last: the blanks inserted above would otherwise
    # leave double, leading or trailing spaces in the result.
    return re.sub('\\s+', ' ', string).strip()

In [10]:
def clean_md(doc):
    '''
    Reduce a markdown document to plain words.

    IN: doc - mapping holding the raw text under 'file_contents'
    RETURN: str - the text without hyperlinks, tags, digits or punctuation,
            with single spaces between the remaining words
    '''
    text = doc['file_contents']

    # literal backslash-n pairs, tabs and both quote characters become blanks
    for token in ('\\n', '\t', "\'", '"'):
        text = text.replace(token, ' ')

    # remove hyperlinks
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, ' ', text).strip()

    # drop anything enclosed in angle brackets, even across lines
    text = re.sub('<.*?>', '', text, flags=re.DOTALL)

    # runs of non-alphanumerics first, then the digits themselves
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    text = re.sub('[0-9]', ' ', text)

    return re.sub('\\s+', ' ', text).strip()

In [11]:
def clean_rst(doc):
    '''
    Reduce a reStructuredText document to plain words.

    IN: doc - mapping holding the raw text under 'file_contents'
    RETURN: str - the text without hyperlinks, tags or any non-letter
            characters, with single spaces between the remaining words
    '''
    text = doc['file_contents']

    # literal backslash-n pairs, tabs and both quote characters become blanks
    for token in ('\\n', '\t', "\'", '"'):
        text = text.replace(token, ' ')

    # remove hyperlinks
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, ' ', text).strip()

    # drop anything enclosed in angle brackets, even across lines
    text = re.sub('<.*?>', '', text, flags=re.DOTALL)

    # everything that is not an ASCII letter becomes a blank
    text = re.sub('[^a-zA-Z]+', ' ', text)

    # markup punctuation pass kept from the original pipeline
    text = re.sub('[-`*+=~]', ' ', text)

    return re.sub('\\s+', ' ', text).strip()

# Clean all files

In [12]:
def clean_script_content(db):
    '''
    Overwrite 'file_contents' of the documents in git_repos_docs_backup with
    their cleaned form.

    IN: db - database handle exposing the 'git_repos_docs_backup' collection
    RETURN: None. Documents are rewritten in place, one update per document.
    '''
    col = db['git_repos_docs_backup']

    for doc in col.find():
        # documents whose extension is exactly 'SKIP' are left untouched
        if doc['file_extension'] == 'SKIP':
            continue

        # Every other extension, md and rst included, currently goes through
        # clean_py; clean_md / clean_rst are not wired in here.
        col.update_one(
                # Address the document by its primary key so the document
                # that was read is the one rewritten, even if several
                # documents share a doc_name.
                {'_id': doc['_id']},
                {'$set': {'file_contents': clean_py(doc)}},
                upsert=False)

In [13]:
# Rewrites 'file_contents' in git_repos_docs_backup in place; the recorded run below took about 3.5 hours wall-clock.
%time clean_script_content(db)

CPU times: user 2min 56s, sys: 56.1 s, total: 3min 52s
Wall time: 3h 29min 11s


# Convert rst READMES to md extension

In [None]:
def mark_readmes_md(db):
    '''
    Relabel rst README files as markdown in git_repos_docs_backup.

    IN: db - database handle exposing the 'git_repos_docs_backup' collection
    RETURN: None. Every document whose 'file_name' contains "README" and
            whose 'file_extension' is 'rst' gets 'file_extension' set to 'md'.
    '''
    col = db['git_repos_docs_backup']

    # One server-side update on the selecting filter itself: only the
    # documents that match are changed, with no per-document round trip.
    col.update_many(
            {'file_name' : {'$regex' : '.*README.*'}, 'file_extension' : 'rst'},
            {'$set': {'file_extension': 'md'}},
            upsert=False)

# Remove short entries

In [14]:
def remove_short_contents(min_len=50, db=db):
    '''
    Delete documents with short contents from git_repos_docs_backup.

    IN: min_len - documents whose 'file_contents' is shorter than this are deleted
        db - database handle (defaults to the module-level `db`)
    RETURN: None. Prints how many py, rst and md documents were deleted;
            documents with other extensions are deleted too but not counted.
    '''
    docs = db['git_repos_docs_backup']
    deleted = {'py': 0, 'rst': 0, 'md': 0}

    for record in docs.find():
        if len(record['file_contents']) >= min_len:
            continue
        ext = record['file_extension']
        for key in ('py', 'rst', 'md'):
            if ext == key:
                deleted[key] += 1
        docs.delete_one({'_id': record['_id']})

    print('{0} .py records deleted'.format(deleted['py']))
    print('{0} .rst records deleted'.format(deleted['rst']))
    print('{0} .md records deleted'.format(deleted['md']))

In [15]:
# Deletes documents from git_repos_docs_backup whose 'file_contents' is shorter than 50 characters.
%time remove_short_contents(min_len=50, db=db)

2882 .py records deleted
543 .rst records deleted
427 .md records deleted
CPU times: user 11.7 s, sys: 10.3 s, total: 22 s
Wall time: 1min 21s


# Upload to S3

In [16]:
def s3_upload(file):
    '''
    Upload a local file to the 'github-spark' S3 bucket.

    IN: file - path of the local file; the same string is used as the S3 key
    RETURN: None. On any error a message with the reason is printed instead
            of raising.
    '''
    try:
        with open('credentials/aws.csv', 'r') as infile:
            _ = infile.readline()  # first line is skipped (presumably a header row)
            raw_fields = infile.readline().replace('"', '').split(',')

        # Strip each field: without this the last one (the secret key) keeps
        # the trailing newline of the line it was read from.
        username, access_key, secret_key = [field.strip() for field in raw_fields]

        conn = tinys3.Connection(access_key, secret_key, tls=True)

        with open(file,'rb') as f:
            conn.upload(file,f,'github-spark')
    except Exception as err:
        # Still best-effort, but report why, and let KeyboardInterrupt /
        # SystemExit propagate.
        print('Upload failed: {0}'.format(err))

# Convert queries into json files and upload to S3

In [25]:
def word2vec_content_json(filename, ext, db=db):
    '''
    Dump the 'file_contents' of matching documents to a JSON file and upload it.

    IN: filename - path of the output file (replaced if it already exists)
        ext - text that 'file_extension' must contain (used inside an
              unanchored regex, so it matches as a substring)
        db - database handle (defaults to the module-level `db`)
    RETURN: None
    '''
    query = {'file_extension' : {'$regex' : '.*{0}.*'.format(ext)}}
    projection = {'_id': False, 'file_contents':1}
    payload = dumps(db['git_repos_docs_backup'].find(query, projection))

    # start from a clean slate, then append the serialized cursor
    if os.path.exists(filename):
        os.remove(filename)

    with open(filename, 'a') as out_file:
        out_file.write(payload)

    # upload to s3
    s3_upload(filename)

In [26]:
# Write one JSON dump per extension under data/ and upload each file to S3.
%time word2vec_content_json('data/py_contents.json', 'py')
%time word2vec_content_json('data/rst_contents.json', 'rst')
%time word2vec_content_json('data/md_contents.json', 'md')

CPU times: user 18.1 s, sys: 9.73 s, total: 27.8 s
Wall time: 1min 6s
CPU times: user 2.77 s, sys: 1.6 s, total: 4.37 s
Wall time: 9.69 s
CPU times: user 1.33 s, sys: 645 ms, total: 1.98 s
Wall time: 6.28 s


# Pymongo to List

In [98]:
def pymongo_list(filename, ext, db=db):
    '''
    Write the 'file_contents' of matching documents to a text file, one
    entry per line-joined chunk, and upload it.

    IN: filename - path of the output file (replaced if it already exists)
        ext - text that 'file_extension' must contain (used inside an
              unanchored regex, so it matches as a substring)
        db - database handle (defaults to the module-level `db`)
    RETURN: None
    '''
    query = {'file_extension' : {'$regex' : '.*{0}.*'.format(ext)}}
    projection = {'_id': False, 'file_contents':1}

    contents = []
    for record in db['git_repos_docs_backup'].find(query, projection):
        contents.append(record['file_contents'])

    if os.path.exists(filename):
        os.remove(filename)

    # entries are separated by a single newline character
    with open(filename, 'w') as out_file:
        out_file.write('\n'.join(contents))

    # upload to s3
    s3_upload(filename)

In [99]:
# Write one newline-joined text file per extension under data/ and upload each file to S3.
%time pymongo_list('data/py_contents.txt', 'py')
%time pymongo_list('data/rst_contents.txt', 'rst')
%time pymongo_list('data/md_contents.txt', 'md')

CPU times: user 20 s, sys: 18.9 s, total: 38.9 s
Wall time: 2min 9s
CPU times: user 2.53 s, sys: 2.38 s, total: 4.91 s
Wall time: 16.7 s
CPU times: user 1.12 s, sys: 1.06 s, total: 2.18 s
Wall time: 8.04 s


# Document count check

In [35]:
def mongo_count(db):
    '''
    Count the documents in git_repos_docs_backup per extension.

    IN: db - database handle exposing the 'git_repos_docs_backup' collection
    RETURN: list of counts in the order py, rst, md (each is also printed)
    '''
    counts = []
    for stem in ['py_contents', 'rst_contents', 'md_contents']:
        # the extension is the part of the stem before the underscore
        ext = stem.split('_')[0]
        query = {'file_extension' : {'$regex' : '.*{0}.*'.format(ext)}}
        n_docs = db['git_repos_docs_backup'].count(query)
        counts.append(n_docs)
        print('Mongo {0} count: {1}'.format(ext, n_docs))
    return counts

In [36]:
def file_count(db):
    '''
    Count the records stored in the exported JSON files.

    IN: db - unused; kept so the signature matches mongo_count
    RETURN: list with the number of top-level entries in
            data/py_contents.json, data/rst_contents.json and
            data/md_contents.json, in that order (each is also printed)
    '''
    counts = []
    for stem in ['py_contents', 'rst_contents', 'md_contents']:
        path = 'data/{0}.json'.format(stem)
        with open(path, 'r') as handle:
            # only the length is kept, so the parsed data can be freed right away
            n_records = len(json.load(handle))
        counts.append(n_records)
        print('File {0} count: {1}'.format(stem.split('_')[0], n_records))
    return counts

In [37]:
def file_loss_check(db):
    '''
    Verify that the JSON exports hold as many records as MongoDB does.

    IN: db - database handle passed through to mongo_count and file_count
    RETURN: None; raises AssertionError when the two count lists differ
    '''
    in_mongo = mongo_count(db)
    in_files = file_count(db)
    assert in_mongo == in_files

In [38]:
# Raises AssertionError if the per-extension Mongo counts and the JSON file counts differ.
file_loss_check(db)

Mongo py count: 114828
Mongo rst count: 19374
Mongo md count: 8495
File py count: 114828
File rst count: 19374
File md count: 8495


# Try to extract just import statements

# Try to build graph network