In [1]:
from sklearn.decomposition import NMF
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned tweet table for each earthquake (order: California, Chile,
# Nepal, Pakistan) and unpack them into their per-region names.
df_ca, df_chile, df_nepal, df_pak = map(
    pd.read_csv,
    [
        '../project_5_second/cleaned/paid_ca.csv',
        '../project_5_second/cleaned/paid_chile.csv',
        '../project_5_second/cleaned/paid_nepal.csv',
        '../project_5_second/cleaned/paid_pak.csv',
    ],
)

# "Supra" set: all four regional tables stacked row-wise (original row labels
# of each table are kept as-is).
df_supra = pd.concat([df_ca, df_chile, df_nepal, df_pak], axis=0)

In [3]:
def nmf_automate(df_input, quant_top_words, quant_components):
    """Fit a TF-IDF + NMF topic model on tweets and return each topic's top words.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain a 'tweet_text' column. Rows whose 'tweet_text' is missing
        (NaN) are skipped; remaining values are coerced to ``str``.
    quant_top_words : int
        Number of highest-weighted terms to report per topic.
    quant_components : int
        Number of NMF components (topics) to extract.

    Returns
    -------
    pandas.DataFrame
        One row per NMF component, with a single column 'topic_words' holding
        the top terms joined by single spaces, highest weight first.
    """
    # TfidfVectorizer raises on NaN documents, which appear when an empty
    # tweet is round-tripped through CSV; drop them instead of crashing.
    tweets = df_input['tweet_text'].dropna().astype(str)

    # Vectorise once and keep the result sparse: NMF accepts sparse input, so
    # densifying the whole document-term matrix only costs memory (and the
    # np.matrix returned by .todense() is rejected by newer scikit-learn).
    tvec = TfidfVectorizer()
    tfidf = tvec.fit_transform(tweets)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(tvec, 'get_feature_names_out'):
        tvec_feature_names = tvec.get_feature_names_out()
    else:
        tvec_feature_names = tvec.get_feature_names()

    n_samples, n_features = tfidf.shape
    try:
        # scikit-learn >= 1.0: `alpha` was replaced by alpha_W / alpha_H, whose
        # penalty terms are multiplied by n_features / n_samples respectively.
        # Dividing by those sizes keeps the same effective strength that the
        # legacy unscaled alpha=.1 applied.
        nmf = NMF(
            n_components=quant_components,
            random_state=42,
            alpha_W=.1 / n_features,
            alpha_H=.1 / n_samples,
            l1_ratio=.5,
            init='nndsvd',
        )
    except TypeError:
        # Older scikit-learn without alpha_W / alpha_H.
        nmf = NMF(
            n_components=quant_components,
            random_state=42,
            alpha=.1,
            l1_ratio=.5,
            init='nndsvd',
        )
    nmf.fit(tfidf)

    topic_list = []
    for topic in nmf.components_:
        # argsort is ascending, so walk it backwards from the end to take the
        # quant_top_words largest weights in descending order.
        top_term_ids = topic.argsort()[:-quant_top_words - 1:-1]
        topic_list.append(' '.join(tvec_feature_names[i] for i in top_term_ids))

    df_output = pd.DataFrame({'topic_words': topic_list})

    return df_output

In [4]:
def nmf_print_automate(df_input, region):
    """Print a numbered list of topic strings under an 'Earthquake <region>' header.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain a 'topic_words' column; one line is printed per row.
    region : str
        Label appended to the 'Earthquake ' header line.

    Output is two blank lines, the header, then ``<position> <topic_words>``
    for every row, where position counts from 0 in row order.
    """
    print()
    print()
    print('Earthquake ' + region)
    # Iterate over the values positionally. Looking rows up with
    # df_input['topic_words'][i] is label-based, so it raises KeyError (or
    # prints the wrong row) whenever the frame's index is not 0..n-1.
    for i, topic_words in enumerate(df_input['topic_words']):
        print(i, topic_words)

In [5]:
# Per-region models: top 10 words for each of 20 topics, fitted in the order
# California, Chile, Nepal, Pakistan.
df_nmf_ca, df_nmf_chile, df_nmf_nepal, df_nmf_pak = [
    nmf_automate(region_frame, 10, 20)
    for region_frame in (df_ca, df_chile, df_nepal, df_pak)
]

# Combined corpus: same top-10 words, but 50 topics.
df_nmf_supra = nmf_automate(df_supra, 10, 50)

In [6]:
# Write every topic table to its CSV (no index column), in the same order as
# they were computed.
for topics_frame, csv_path in (
    (df_nmf_ca, '../project_5_second/datasets/nmf_ca.csv'),
    (df_nmf_chile, '../project_5_second/datasets/nmf_chile.csv'),
    (df_nmf_nepal, '../project_5_second/datasets/nmf_nepal.csv'),
    (df_nmf_pak, '../project_5_second/datasets/nmf_pak.csv'),
    (df_nmf_supra, '../project_5_second/datasets/nmf_supra.csv'),
):
    topics_frame.to_csv(csv_path, index=False)

In [7]:
# Print each topic table under its region heading.
for topics_frame, region_label in (
    (df_nmf_ca, 'California'),
    (df_nmf_chile, 'Chile'),
    (df_nmf_nepal, 'Nepal'),
    (df_nmf_pak, 'Pakistan'),
    (df_nmf_supra, 'Supra Set'),
):
    nmf_print_automate(topics_frame, region_label)



Earthquake California
0 california northern earthquake tco http rt after via rocks strong
1 up jawbone many how data woke napa woken the were
2 news usa hot earthquake heyyouapp news24lhot janinebucks breaking rt abc
3 wine country quake dozens california famed shakes billions strong tco
4 25 years strongest in northern quake california hit is tyleronemo
5 state emergency declared of after jerry brown california rt earthquake
6 of canyon american map details occurred earthquakessf http 6km california
7 60 magnitude by rocked northern hit usa struck rattled fox
9 area bay shakes earthquake damages sleepers county tco strong airport
10 the one is of big was this that in on
11 buckled streets shredding make best skaters by of napa quote
12 more aftershocks scores felt on from way usa of have
13 hurt boy dead should be quake in badly bricks california
14 injured critically dozens after strong usa hospital in treated three
15 napa damage http tco valley in quake earthquake rt drone
16 cou