In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [35]:
with open('./assets/stopwords.pkl','rb') as f:
    stopwords = pickle.load(f)

In [36]:
gun_stop = ['shooting', 'guns', 'gun', 'shoot', 'bullets', 'bullet', 'shootings', 
            'clip', 'bar', 'college','student','students', 'magazine', 'ammo', 'ammunition','shootings',
'rampage', 
'gunman', 
'stabbing', 
'slayings', 
'shooter', 
'fatally', 
'killings', 
'killing', 
'shot', 
'unarmed', 
'massacre', 
'slaying', 
'incident', 
'stabbings',
'altercation',
'shooters', 
'murder', 
'shoot', 
'carjacking'
'gunned', 
'murders', 
'firing', 
'newtown',
'scuffle', 'gunmen', 'florida', 'keys', 'key', 'school', 'ian', 'david','texas', 'miami', 'california', 'harvey', 'thousandoaks',
           'assault','houston', 'ventura', 'san','cudjoe', 'oaks', 'en', 'de', 'la', 
            'el', 'tiroteo', 'th','massshooting','californiashooting']

In [37]:
stopwords.extend(gun_stop)

In [38]:
combined_df = pd.read_csv('../project_4/assets/combined_edit_df.csv')

In [39]:
combined_df.shape

(98450, 5)

In [40]:
y = combined_df['disaster']

# Set X as text column.
X = combined_df['text']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y, 
                                                    test_size=.30,
                                                   random_state=42)

In [42]:
tfidf = TfidfVectorizer(stop_words = stopwords, 
                        max_df=0.95, 
                        min_df=5, max_features=1000)

In [43]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

  'stop_words.' % sorted(inconsistent))


In [44]:
from sklearn.linear_model import LogisticRegression

In [47]:
lr = LogisticRegression()
model = lr.fit(X_train_tfidf, y_train)



In [48]:
print(f'LogReg Training score: {model.score(X_train_tfidf, y_train)}')
print(f'LogReg Testing score: {model.score(X_test_tfidf, y_test)}')

LogReg Training score: 0.8550097946745991
LogReg Testing score: 0.8557982055188759


In [52]:
X_train_df_tfidf = pd.SparseDataFrame(X_train_tfidf,
                             columns=tfidf.get_feature_names())

In [53]:
X_train_df_tfidf.shape

(68915, 1000)

In [54]:
X_train_df_tfidf.head()

Unnamed: 0,abc,able,absolutely,access,accident,across,act,active,actually,added,...,year,years,yes,yesterday,yet,yo,york,young,youtube,zone
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,0.303191,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [55]:
X_train_df_tfidf.fillna(0, inplace=True)

In [56]:
X_test_df_tfidf = pd.SparseDataFrame(X_test_tfidf,
                                    columns = tfidf.get_feature_names())

In [57]:
X_test_df_tfidf.fillna(0, inplace=True)
print(X_test_df_tfidf.shape)

(29535, 1000)


In [69]:

n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5,
                                max_features=n_features,
                                stop_words=stopwords)

X_train_tf = tf_vectorizer.fit_transform(X_train)
X_test_tf = tf_vectorizer.transform(X_test)


lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)

lda_train = lda.fit_transform(X_train_tf)
lda_test = lda.transform(X_test_tf)


print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...


  'stop_words.' % sorted(inconsistent))



Topics in LDA model:
Topic #0: thousand mass people killed dead pic victims least borderline go power un grill traffic many stop take long another southern
Topic #1: tx know get ever let st show park photo birthday posted drinking bay fun closed downtown party mom watching care
Topic #2: landfall love category makes mph pic made winds lower update fema percent gone begins well says making weekend surge state
Topic #3: pic eye prayers south best eyewall hits everyone evacuation real next mandatory may west reaches thoughts better latest lower evacuations
Topic #4: today tonight stay city largo open way heart world year time much still pic feel multiple safe goes shows high
Topic #5: new live good really hit one watch center updates could hard work pic near post make west bad area times
Topic #6: day like great got look happy morning pic getting think sunday ready game always even lol thanks around beautiful guys
Topic #7: west man would everything someone us ahead pic looking hate abc 

In [70]:
feature_loadings = pd.DataFrame(lda.components_, 
                                columns = tf_vectorizer.get_feature_names(),
                                index = [f'topic_{x}' for x in range(lda.components_.shape[0])]).T

In [71]:
feature_loadings.shape

(1000, 10)

In [72]:
feature_loadings.sort_values('topic_7', ascending=False).head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
west,0.100016,0.100044,0.10004,213.052667,0.100018,152.841279,101.547642,340.233053,37.983505,205.201675
man,0.100037,0.100008,0.100004,0.100006,0.10001,0.10001,0.100011,278.123742,0.10001,0.100007
would,0.100019,0.100009,0.100004,0.100009,0.100015,0.100012,0.100014,240.63545,0.100016,0.100007
everything,0.100008,0.100005,0.100003,0.100005,0.100007,0.100005,0.100005,232.059128,0.10001,0.100006
someone,0.10001,0.10001,0.100005,0.100007,0.100011,0.100012,0.100012,229.937012,0.100018,0.10001
us,80.650146,0.100023,13.109711,52.283461,40.281836,30.984174,0.100026,180.943214,81.146852,59.370289
ahead,0.100003,0.100003,0.100009,0.100037,0.100002,0.100007,0.100006,177.796948,0.100005,0.100012
pic,374.744283,100.209638,411.539007,952.493462,177.676277,184.578221,269.945153,161.442305,164.011349,250.060091
looking,0.100007,0.100005,0.100004,0.100004,0.100012,0.100008,0.100008,160.137978,0.10001,0.100006
hate,0.100009,0.100007,0.100004,0.100004,0.100008,0.100008,0.100011,159.62848,0.100009,0.100006


In [73]:
# Instantiate linear regression model.
logreg = LogisticRegression()



# Fit on Z_train.
logreg.fit(lda_train, y_train)

# Score on training and testing sets.
print(f'Training Score: {round(logreg.score(lda_train, y_train),4)}.')
print(f'Testing Score: {round(logreg.score(lda_test, y_test),4)}.')

Training Score: 0.7776.
Testing Score: 0.7815.


