In [1]:
import sys
# Add the sibling src directory (resolved relative to the current working directory) to the import search path
sys.path.append('./../src')

In [122]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import datetime
import re
from importlib import reload
from pymongo import MongoClient
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Render inline figures as SVG
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
# Apply seaborn defaults first, then the matplotlib style sheet on top of them
sns.set(color_codes=True)
# NOTE(review): newer matplotlib releases appear to have renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-colorblind') - confirm against the installed version
plt.style.use('seaborn-colorblind')

# pandas display limits: up to 1000 columns, 50 rows, 2 digits of precision
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)

## Downloading Data from MongoDB ##

In [171]:
# Serialize the raw gd corpus to disk.
# NOTE(review): gd_corpus is built in a cell further down this notebook, so on a
# fresh kernel this cell must be run after that one.
with open('../pickles/raw_gd_corpus.pickle', 'wb') as pickle_file:
    pickle.dump(gd_corpus, pickle_file)

In [172]:
# Serialize the raw g3s corpus to disk.
# Fix: the original dumped gd_corpus into the g3s pickle, so the file named
# raw_g3s_corpus.pickle held a copy of the gd data.
# NOTE(review): g3s_corpus is built in a cell further down this notebook, so on a
# fresh kernel this cell must be run after that one.
with open('../pickles/raw_g3s_corpus.pickle', 'wb') as to_write:
    pickle.dump(g3s_corpus, to_write)

In [69]:
# Connect using pymongo's default connection settings and open the `blogs` database
client = MongoClient()
db = client['blogs']
# Show which collections the database holds
db.list_collection_names()

['g3s', 'gd']

In [70]:
# Query every document in the `gd` collection, returning only the `text` field (no `_id`)
cursor = db['gd'].find({}, {'_id': 0, 'text': 1})

In [71]:
# Store all gd texts (queried from mongodb) in a list of strings.
# The query is issued inside this cell instead of consuming the shared `cursor`
# variable: a pymongo cursor can only be iterated once and the same name is reused
# for the g3s query, so re-running the cell could yield [] or the wrong collection.
gd_corpus = [
    doc['text']
    for doc in db.gd.find({}, {'_id': 0, 'text': 1})
]

In [72]:
# Query every document in the `g3s` collection, returning only the `text` field (no `_id`)
cursor = db['g3s'].find({}, {'_id': 0, 'text': 1})

In [73]:
# Store all g3s texts (queried from mongodb) in a list of strings.
# The query is issued inside this cell instead of consuming the shared `cursor`
# variable: a pymongo cursor can only be iterated once and the same name is reused
# for the gd query, so re-running the cell could yield [] or the wrong collection.
g3s_corpus = [
    doc['text']
    for doc in db.g3s.find({}, {'_id': 0, 'text': 1})
]

In [74]:
# Eyeball the raw text of one g3s document (index 150)
g3s_corpus[150]

'\n\nFollower Fridays is a series of profiles highlighting members of Gaysian Third Space to showcase the diversity of gaysians in the Community. This week’s featured member is @beat0t.\n\n\n\nWho are you?\n\n\n\nMy name is Bea. I’m a recent college graduate, co-founder of a dance company based in Philadelphia called D2D: Dare To Dance. \n\n\n\nWhere are you from?\n\n\n\nI was born and raised in the island of Leyte, Philippines. Lived there most of my life before moving to the States when I was 12.\n\n\n\nWhat do you do?\n\n\n\nI’m you’re typical post grad, looking to jump start their career. I also teach dance at a local dance studio in Philadelphia\n\n\n\nWhat are you passionate about?\n\n\n\nI truly love dancing and I want to use dance as an avenue to change the world, no matter how small that change may be. I am also passionate about my Pilipinx identity. Little by little, I’m learning more and more about being a Pilipinx, I fall more and more in love about my heritage and my ident

In [75]:
# Number of g3s documents containing the exact (case-sensitive) phrase 'Follower Fridays'.
# A generator expression keeps the loop variable out of the notebook namespace,
# where a variable called `text` would share a name with sklearn's `text` module.
count_follower_fridays = sum('Follower Fridays' in post for post in g3s_corpus)
count_follower_fridays

139

In [76]:
# Number of g3s documents containing the exact (case-sensitive) phrase 'come out'.
# A generator expression keeps the loop variable out of the notebook namespace,
# where a variable called `text` would share a name with sklearn's `text` module.
count_come_out = sum('come out' in post for post in g3s_corpus)
count_come_out

18

In [77]:
# Number of g3s documents containing the exact (case-sensitive) phrase 'coming out'.
# A generator expression keeps the loop variable out of the notebook namespace,
# where a variable called `text` would share a name with sklearn's `text` module.
count_coming_out = sum('coming out' in post for post in g3s_corpus)
count_coming_out

20

In [124]:
# Number of gd documents containing the exact (case-sensitive) phrase 'coming out'.
# NOTE: this rebinds count_coming_out, which the g3s cell also assigns.
# A generator expression keeps the loop variable out of the notebook namespace,
# where a variable called `text` would share a name with sklearn's `text` module.
count_coming_out = sum('coming out' in post for post in gd_corpus)
count_coming_out

6

## Text Preprocessing ##

In [147]:
# 'coming out' is an important term in LGBT texts, so 'out' must survive stop-word
# filtering: take sklearn's English stop-word set and drop that one word from it.
from sklearn.feature_extraction import text
custom_stop_words = text.ENGLISH_STOP_WORDS - {'out'}

In [112]:
# Remove every word-like token that contains at least one digit from both corpora.
# This drops bare numbers ('2019') and also mixed tokens such as 'g3s'.
# The pattern is a raw string: in a plain literal '\w' and '\d' are invalid escape
# sequences, which recent Python versions warn about.
clean_gd_corpus = [re.sub(r'\w*\d+\w*', '', doc) for doc in gd_corpus]
clean_g3s_corpus = [re.sub(r'\w*\d+\w*', '', doc) for doc in g3s_corpus]

In [113]:
# Spot-check the first g3s document after digit-token removal
clean_g3s_corpus[0]

'Follower Fridays is a series of profiles highlighting members of Gaysian Third Space to showcase the diversity of gaysians in the Community. This week’s featured member is @dandypandy.\n\n\n\n\n\nWho are you?\n\n\n\nMy identity is in flux at this time of my life as I try to reconcile future aspirations and current actions. I am a second generation Korean American who sometimes forgets to live a little.\n\n\n\nWhere are you from?\n\n\n\nI grew up in the suburbs near LA and am now going to school in San Diego.\n\n\n\nWhat do you do?\n\n\n\nI study mechanical engineering, co-coordinate a student organization, volunteer in river clean-ups, and work with GIS. In my free time I like to take pictures, eat froyo, and dance (after having a few drinks).\n\n\n\nWhat are you passionate about?\n\n\n\nMy recent time in school has allowed me to explore my interests and discover what I’m passionate about, namely the environment, education, and service. Nature provides personal enjoyment for me and is

## Testing CountVectorizer and TfidfVectorizer ##

In [68]:
# Try the WordNet lemmatizer on two words, tagging each as a verb
wordnet_lemmatizer = WordNetLemmatizer()
# words = ['coming', 'come']
wordnet_lemmatizer.lemmatize('house', pos='v')
wordnet_lemmatizer.lemmatize('home', pos='v')


'house'

'home'

In [166]:
# Bag-of-words counts over unigrams and bigrams of the cleaned gd corpus.
# min_df=3 keeps only terms that occur in at least three documents.
# stop_words is passed as a list: custom_stop_words is a frozenset, and
# scikit-learn >= 1.2 validates this parameter and accepts only 'english', a list, or None.
# sorted() gives a deterministic list.
cv = CountVectorizer(
    stop_words=sorted(custom_stop_words),
    ngram_range=(1, 2),
    min_df=3,
)
# Dense (documents x terms) array of counts
X = cv.fit_transform(clean_gd_corpus).toarray()

In [167]:
# Dimensions of the count matrix returned by fit_transform
X.shape

(34, 1445)

In [168]:
# Document-term frame: one row per gd document, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back to the old method on older installs.
_get_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
gd_doc_term = pd.DataFrame(X, columns=_get_feature_names())

In [169]:
# Preview the first rows of the document-term frame
gd_doc_term.head()

Unnamed: 0,ability,able,abroad,absolutely,abuse,accept,acceptance,accepted,act,act like,action,actions,actively,activism,activities,actual,actually,actually told,add,additional,address,advantage,affect,afraid,age,ago,alcohol,allow,allowed,allows,amazing,america,american,american culture,americans,angeles,angry,announced,answer,answers,anxiety,anxious,anymore,api,app,appearance,appreciate,appreciation,apps,apps like,april,area,arm,arms,arrived,ashamed,asia,asian,asian american,asian americans,asian friends,asian guy,asian male,asian pacific,asians,ask,asked,asking,ass,attend,attended,attending,attention,attracted,attraction,attractions,authentic,author,avoid,avoided,aware,away,bad,badly,balance,bar,barely,based,basically,bathroom,bay,beat,beautiful,beauty,bed,began,beginning,behaviors,believe,believed,believing,belong,best,best friend,best way,bet,better,better did,better life,big,bigger,biggest,birth,birthday,bit,bitter,black,blame,board,bodies,body,born,bother,bought,bound,boundaries,boy,boyfriend,boys,break,breath,bridge,bring,brothers,brought,build,building,built,burden,business,busy,buy,called,calm,came,came close,came out,cancer,car,card,care,career,case,casino,caught,cause,caused,center,centers,ceremony,certain,certainly,chair,challenge,challenges,chance,chances,change,changed,changing,chatted,child,childhood,children,china,chinese,choice,choose,chosen,christian,church,circle,city,class,classes,classmates,clean,clear,clearly,climate,close,close friends,closed,closer,closest,closet,closing,club,clubs,coast,cold,college,college came,college met,color,columbia,come,come out,comes,comfort,comfortable,coming,coming out,coming 
terms,committee,common,communities,community,company,complain,completely,complex,complicated,compounded,concept,conference,confidence,confident,conform,confront,confused,confusing,connect,connection,connections,conscious,consequence,consequences,conservative,consider,considered,considering,constantly,contact,context,continually,continue,continued,conversation,conversations,cool,corner,corners,couldn,counter,countless,countries,country,courage,course,create,created,cried,crisis,critical,cross,crying,cultural,culture,curious,current,currently,cut,dad,daily,damn,dance,dancing,dating,dating apps,day,day day,day like,days,deal,dealing,dear,dear diary,death,decide,decided,decision,decisions,dedicated,deep,deeply,defense,define,defined,definitely,definition,denied,deny,depressed,depression,desire,desires,desperately,despite,details,determined,develop,developed,developing,diaries,diary,diary entry,did,didn,didn feel,didn know,didn really,didn want,die,died,different,differently,difficult,dinner,directly,director,discovered,discovery,discrimination,discuss,distance,diversity,does,doesn,doing,don,don feel,don know,don think,don wrong,door,doors,double,double minority,doubt,drama,dramatic,draw,dream,dreamed,dress,drinks,drive,drunk,dying,eager,earlier,early,easier,easily,east,east coast,easy,eat,eating,educated,education,effort,efforts,embrace,emotional,emotionally,emotions,empower,end,ended,ending,energy,english,enjoy,enjoyed,enjoying,entire,entry,environment,erase,especially,esteem,ethnicity,evening,event,events,eventually,everyday,exact,exactly,excited,exercise,exhausted,exist,expect,expected,experience,experienced,experiences,explore,express,expressed,extra,extremely,eye,eyes,face,facebook,facing,fact,failed,failing,failure,faith,fall,falling,families,family,family members,far,fast,father,fault,favorite,fear,fears,feel,feel like,feeling,feelings,feels,feels like,fell,felt,felt like,fight,fighting,figure,figure 
out,filled,final,finally,finding,fine,finish,finished,fit,flew,floor,flow,focus,focused,folk,folks,follow,following,food,force,forced,foreign,foreign country,forever,forget,form,formed,fortunate,forward,fought,francisco,free,friend,friends,friends family,friendship,friendships,fuck,fully,fun,funny,future,gain,gained,game,games,gapimny,gapimny steering,gave,gay,gay asian,...,point,police,political,poor,porn,positive,possibility,possible,post,potential,power,powerful,prefer,press,pressure,pretty,pride,prior,prioritize,private,privilege,privileged,probably,problem,problems,process,professional,professor,program,programming,programs,progress,progressive,project,promised,promises,protection,proud,prove,provide,provided,public,pure,pushed,putting,queens,queer,queer asian,question,questions,quick,quickly,quiet,quietly,quite,race,racial,racism,racist,raise,raised,ran,random,range,rarely,reach,reach out,reached,read,reading,ready,real,reality,realize,realized,realizing,really,reason,reasons,reassured,received,recent,recently,recognized,reduced,reflect,reflection,refused,regret,rejection,rejections,relate,related,relationship,relationships,relatively,relatives,remember,remembered,reminded,repeatedly,reply,represented,request,resolution,resolve,respect,respond,responded,response,responses,responsibility,rest,restaurant,result,return,returned,ride,right,rights,rise,risk,risks,role,romantic,room,rules,run,running,ryan,sadly,sadness,safe,safe space,safety,said,san,san francisco,sat,saw,say,saying,scared,scene,school,school felt,second,seconds,seeing,seek,seen,self,self esteem,self love,sending,senior,senior year,sense,sensitive,sent,separated,series,seriously,serve,service,session,set,setting,seven,sex,sexual,sexuality,sexy,sf,shame,shape,shaped,share,share stories,shared,sharing,sharing 
story,shit,short,shortly,shot,showed,shut,signature,significant,silence,silent,similar,simple,simply,single,sister,sitting,situation,situations,skin,skinny,sleep,sleeping,slightly,slowly,small,smile,smiled,social,social anxiety,social media,socially,society,society large,soft,somewhat,soon,sophomore,sorry,soul,sound,sounded,sounds,source,space,spaces,speak,speaking,special,specific,spend,spent,spirit,spoke,sports,spot,stage,stand,standard,standing,stared,start,started,starting,starts,state,statement,states,status,stay,stayed,steering,steering committee,step,stick,stomach,stood,stop,stopped,store,stories,stories told,story,straight,street,streets,strength,stress,strong,stronger,struggle,struggled,struggles,struggling,stuck,students,study,stuff,stumbled,stupid,style,subject,success,successful,sudden,suddenly,suffer,suffering,summer,sun,super,support,supported,supportive,supposed,supposedly,sure,surely,surprise,surprised,survival,sweat,systems,tables,taken,takes,taking,talk,talked,talking,taught,teacher,teachers,team,tears,television,tell,telling,temporary,term,terms,text,thank,thankful,thanks,thing,things,things like,things make,think,thinking,thought,thoughts,throw,tight,till,time,time didn,time looking,times,times like,tinder,tired,today,told,took,topics,touch,touched,tough,town,toxic,track,traditional,traits,trans,transgender,trauma,traumas,travel,traveled,traveling,treat,treated,tried,tried best,trip,trouble,troubled,true,truly,trump,trust,trusted,truth,try,trying,trying best,turn,turned,turning,turning point,twice,type,types,ugly,ultimately,uncomfortable,understand,understanding,understood,unfair,unfortunately,unhealthy,unique,united,united states,university,unknown,unwanted,ups,urge,use,used,using,usually,validate,validated,value,values,ve,ve come,ve felt,ve gotten,video,vietnamese,vietnamese american,view,violence,violent,visit,visited,voice,void,vulnerable,wait,waiting,wake,walk,walked,walking,want,wanted,wants,war,warm,wasn,watching,water,way,way 
out,ways,wearing,week,weekend,weeks,weight,weird,welcome,went,went out,weren,west,western,white,whitewashing,wide,windows,winter,wish,wished,wishes,woke,woman,women,won,wonder,wondered,word,words,work,work out,worked,working,workplace,world,worries,worry,worrying,worse,worth,wouldn,wouldn able,write,writing,written,wrong,wrote,year,year ago,year high,years,years ago,years old,yes,york,york city,young,younger
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,4,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,2,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,1,8,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,2,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,6,0,0,0,0,0,2,2,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,1,0,0,2,
2,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,1,1,1,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,3,1,0,1,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,6,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,1,0,0,0,0,0,7,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1
,1,0,0
2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,2,0,0,0,9,5,0,1,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,3,0,0,0,1,0,0,2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,2,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,1,5,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,5,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,1,4,2,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,1,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,6,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,4,4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,
0,0,0
3,1,1,0,0,1,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,2,1,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,4,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,2,0,1,0,0,0,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,1,0,0,0,...,2,0,0,0,0,0,0,0,1,0,2,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,2,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,6,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0
4,0,5,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,5,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,1,0,1,2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,2,0,1,0,0,0,1,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,3,1,0,0,1,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,1,0,3,0,0,1,2,0,0,1,0,1,1,0,1,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,2,0,2,0,1,2,3,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,1,0,0,0,1,0,0,0,0,0,18,3,...,0,0,0,0,1,0,0,2,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,4,0,0,0,0,0,0,0,0,0,0,1,0,1,0,2,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,2,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,7,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,5,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,3,1,5,0,0,0,0,3,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,2,2,0,1,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0,0,0,1,0,1,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,1,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,1,0,0,0,0,0,6,0,0,0,0,0,1,0,0,0,1,0,0,2,0,1,3,0,0,1,0
,0,0,0


In [161]:
# Confirm that an ordinary stop word such as 'to' is still in the customised set
'to' in custom_stop_words

True

In [160]:
# Inspect gd document 28 after digit-token removal
clean_gd_corpus[28]

'Dear Diary,\n\nI feel like I’ve been out forever, but it hasn’t been that long. When I started writing this entry on\xa0April , , I had come out exactly three years ago.\n\nI was nineteen, a sophomore in college, when I came out to my first friend: he was a good\xa0college friend of mine who, like me, was gay and Vietnamese American. It was a Tuesday night when it happened.\xa0I had asked my friend earlier if we could talk at a quiet café on the edge of campus. I didn’t mention to him what it was about. As I walked to the café, I clutched an index\xa0card inside my jacket’s pocket as if everything depended on it.\n\nDays before, I had been deliberating about how I should come out to my friend. I was afraid\xa0that I wouldn’t be able to say the words to him when the time came, so I compromised: I’ll\xa0write “I’m gay” on an index card and give it to him—simple as that. But when I wrote the index\xa0card the night before, I decided I didn’t want to just write “I’m gay”. Instead, I wrote

In [170]:
# Check whether the non-breaking space ('\xa0') became a vocabulary term.
# A membership test answers that without leaving a cell that ends in an uncaught
# KeyError when the column is absent.
'\xa0' in gd_doc_term.columns

False