In [1]:
import os
import csv
import re
from pymongo import MongoClient
from pprint import pprint

# Credentials

In [2]:
# One-line credentials file in the form: <public ip>, <mongo user>, <mongo password>
pw_file = 'credentials/mongo_pw.txt'
if not os.path.exists(pw_file):
    # Fail here with a clear message rather than with a NameError in a later cell
    raise IOError('MongoDB credentials file not found: {0}'.format(pw_file))

with open(pw_file, 'r') as f:
    # Split into at most three fields and trim each one, so both "a,b,c" and
    # "a, b, c" parse and a comma inside the password (last field) is preserved.
    pub_ip, mongo_usr, mongo_usr_pw = [
        field.strip() for field in f.readline().split(',', 2)
    ]

# Connect to MongoDB

In [3]:
# Open a client against the MongoDB server on the EC2 host (port 27017)
mongo_host = '{0}:27017'.format(pub_ip)
client = MongoClient(mongo_host)

# Select the github_db database
db = client['github_db']

# Authenticate this user against github_db; the call's result is the cell output
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Show MongoDB collections

In [4]:
def show_collections():
    '''
    Print one line per collection of the module-level ``db`` handle, in
    alphabetical order, giving the collection name and its document count.

    IN: None
    RETURN: None (the report is printed)
    '''
    line_template = 'Documents in \"{0}\" collection: {1}'

    names = list(db.collection_names())
    names.sort()

    for name in names:
        doc_count = db[name].count()
        print(line_template.format(name, doc_count))

In [5]:
# Print the document count of every collection in the connected database
show_collections()

Documents in "git_repos_contributors" collection: 2196
Documents in "git_repos_docs" collection: 96758
Documents in "git_repos_meta" collection: 2195
Documents in "git_repos_subscribers" collection: 2196
Documents in "git_users_followers" collection: 626
Documents in "git_users_following" collection: 626
Documents in "git_users_meta" collection: 626


In [6]:
def get_fieldname_tsv(col):
    '''
    Build a TSV header line for a collection from the keys of one of its
    documents.

    IN: col - name of a collection in the module-level ``db`` handle
    RETURN: str - the field names joined by a tab followed by a space and
            terminated by a newline. For 'git_repos_docs' the 'file_contents'
            field is reported as 'strip_contents'. If the collection has no
            documents the result is just the newline.
    '''
    sample_doc = db[col].find_one()

    # find_one() gives None when the collection is empty; treat that as
    # "no fields" instead of failing on None.keys().
    fields = list(sample_doc.keys()) if sample_doc is not None else []

    if col == 'git_repos_docs':
        # Rename only the exact field, not any other name that merely contains
        # the text 'file_contents'. (Presumably the new name reflects that the
        # exported contents are whitespace-stripped - confirm with the author.)
        fields = ['strip_contents' if name == 'file_contents' else name
                  for name in fields]

    return '\t '.join(fields) + '\n'

In [7]:
# Preview the header line that would be generated for the docs collection
get_fieldname_tsv('git_repos_docs')

'strip_contents\t repo_id\t doc_name\t file_name\t _id\t file_extension\t repo_fullname\t repo_name\t repo_owner_login\n'

In [10]:
def convert_repodoc_tsv(doc):
    '''
    Serialize one repository document as a single tab-delimited line.

    IN: doc - dict-like document; must have a string 'file_contents' entry.
              The document itself is left unmodified.
    RETURN: str - str() of every value, in the document's own key order,
            joined by a tab followed by a space and terminated by a newline.
            Every run of whitespace in 'file_contents' (newlines and tabs
            included) is collapsed to one space and the ends are trimmed, so
            the contents cannot break the one-line-per-document layout.
    RAISES: KeyError if 'file_contents' is missing.
    '''
    # A raw-string pattern avoids the invalid '\s' escape warning; the single
    # substitution already covers the newline and tab replacements.
    flat_contents = re.sub(r'\s+', ' ', doc['file_contents']).strip()

    fields = []
    for key, value in doc.items():
        if key == 'file_contents':
            # Substitute the flattened text here instead of writing it back
            # into the caller's dict.
            value = flat_contents
        fields.append(str(value))

    return '\t '.join(fields) + '\n'

In [11]:
# Fetch one sample document and preview its TSV serialization.
# Despite its name, `cursor` holds the single result of find_one().
cursor = db['git_repos_docs'].find_one()
convert_repodoc_tsv(cursor)

"## What is this Python project? Describe features. ## What's the difference between this Python project and similar ones? Enumerate comparisons. -- Anyone who agrees with this pull request could vote for it by adding a :+1: to it, and usually, the maintainer will merge it when votes reach **20**.\t 21289110\t 21289110__PULL_REQUEST_TEMPLATE.md\t PULL_REQUEST_TEMPLATE.md\t 57ce15d48532183ad7e11e84\t md\t vinta/awesome-python\t awesome-python\t vinta\n"

In [12]:
def filter_filetypes(db, col, _filter='all'):
    '''
    Query a collection, optionally restricted by file extension.

    IN: db      - database handle, indexed by collection name
        col     - collection name
        _filter - 'all' (default) for an unfiltered query, otherwise a regular
                  expression that the whole 'file_extension' value must match,
                  ignoring case (e.g. 'py' or 'py|md')
    RETURN: whatever db[col].find(...) returns for the query
    '''
    if _filter == 'all':
        return db[col].find()

    # Anchor the pattern at both ends: an unanchored 'py' would also select
    # extensions that merely contain it, such as 'ipynb' or 'pyc'.
    extension_re = re.compile('^(?:{0})$'.format(_filter), re.IGNORECASE)
    return db[col].find({'file_extension': extension_re})

In [13]:
import gzip
import shutil

In [22]:
def export_mongo_to_gztsv(db, col, filename, _filter='all'):
    '''
    Dump a collection to '<filename>.txt' as tab-delimited text, then write a
    gzip-compressed copy to '<filename>.txt.gz'.

    IN: db       - database handle
        col      - collection name
        filename - output path without extension
        _filter  - passed through to filter_filetypes ('all' for no filter)
    RETURN: None

    Each document becomes one line via convert_repodoc_tsv. No header row is
    written.
    '''
    txt_path = '{0}.txt'.format(filename)
    gz_path = '{0}.txt.gz'.format(filename)

    cursor = filter_filetypes(db, col, _filter)

    with open(txt_path, 'w') as outfile:
        for doc in cursor:
            outfile.write(convert_repodoc_tsv(doc))

    # Compress only after the text file has been closed. Reading it back while
    # the writer is still open misses whatever is still buffered and yields a
    # truncated (or empty) archive.
    with open(txt_path, 'rb') as infile:
        with gzip.open(gz_path, 'wb') as gzfile:
            shutil.copyfileobj(infile, gzfile)

In [23]:
# Export the documents whose file_extension matches 'py' to py_docs.txt and py_docs.txt.gz
export_mongo_to_gztsv(db, 'git_repos_docs', 'py_docs', 'py')

# Upload to S3

In [17]:
# Read the AWS key pair from the second line of the credentials CSV
with open('credentials/aws.csv', 'r') as infile:
    # Discard the first line without binding it to `_`, which IPython uses for
    # the last cell output.
    infile.readline()
    # Drop the quotes and trim every field: readline() keeps the line
    # terminator, which would otherwise end up inside secret_key.
    username, access_key, secret_key = [
        field.strip()
        for field in infile.readline().replace('"', '').split(',')
    ]

In [19]:
# Confirm the key was loaded without echoing it into the saved notebook output
bool(secret_key)

True

In [214]:
import tinys3

# TLS-enabled S3 connection using the key pair loaded from credentials/aws.csv
conn = tinys3.Connection(access_key, secret_key, tls=True)

# NOTE(review): the export cell writes 'py_docs.txt', while this uploads
# 'repo_py_docs.txt' - confirm which file is meant to go to the bucket.
# The with-block closes the file handle once the upload call returns.
with open('repo_py_docs.txt', 'rb') as f:
    response = conn.upload('repo_py_docs.txt', f, 'github-spark')

# Show the upload response as the cell output
response

<Response [200]>

In [18]:
import pandas as pd

# Load every document of the docs collection into memory, one row per document
df = pd.DataFrame([doc for doc in db['git_repos_docs'].find()])

In [23]:
# Optional column drops, currently disabled.
# NOTE(review): the saved output below has no _id or doc_name column, which
# suggests these were run once before being commented out - confirm.
#del df['_id']
#del df['doc_name']
# Preview the first rows of the frame
df.head()

Unnamed: 0,file_contents,file_extension,file_name,repo_fullname,repo_id,repo_name,repo_owner_login
0,## What is this Python project?\n\nDescribe fe...,md,PULL_REQUEST_TEMPLATE.md,vinta/awesome-python,21289110,awesome-python,vinta
1,# Contributing\n\nYour contributions are alway...,md,CONTRIBUTING.md,vinta/awesome-python,21289110,awesome-python,vinta
2,# Awesome Python [![Awesome](https://cdn.rawgi...,md,README.md,vinta/awesome-python,21289110,awesome-python,vinta
3,"# coding: utf-8\n\n""""""\n The approach taken...",py,sort.py,vinta/awesome-python,21289110,awesome-python,vinta
4,Requests is written and maintained by Kenneth ...,rst,AUTHORS.rst,kennethreitz/requests,1362490,requests,kennethreitz


In [24]:
# Keep only rows whose extension is exactly 'py', with the columns reordered
export_columns = ['file_name', 'file_extension', 'file_contents', 'repo_name',
                  'repo_id', 'repo_fullname', 'repo_owner_login']
is_python = df['file_extension'] == 'py'
df = df.loc[is_python, export_columns]

In [28]:
# Keep the first rows of the filtered frame as a small sample
a = df.head()

In [32]:
# File contents of the first row of the filtered frame
b = df.iloc[0]['file_contents']

In [29]:
# Write the sample rows to test.csv in the working directory
a.to_csv('test.csv')

In [33]:
# Write the sampled file contents out as my.py
with open('my.py', 'w') as f:
    f.write(b)