In [21]:
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

In [68]:
#read in data, very basic eda
dftrain = pd.read_csv(r'C:\Users\lwu31\OneDrive - JNJ\Documents\train.csv')
train_data, train_labels = dftrain.text, dftrain.target
print("Number of disaster tweets:\n", train_labels.value_counts())
print("\nShape of train data:", dftrain.shape)
print("\nMissing data in each column:\n", dftrain.isnull().sum())
#split train set into disaster and non disaster sets
train_0 = dftrain.loc[dftrain.target == 0]
train_1 = dftrain.loc[dftrain.target == 1]


dftest = pd.read_csv(r'C:\Users\lwu31\OneDrive - JNJ\Documents\test.csv')
print("\n\nShape of test data:", dftest.shape)
print("\nMissing data in each column:\n", dftest.isnull().sum())

Number of disaster tweets:
 0    4342
1    3271
Name: target, dtype: int64

Shape of train data: (7613, 5)

Missing data in each column:
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


Shape of test data: (3263, 4)

Missing data in each column:
 id             0
keyword       26
location    1105
text           0
dtype: int64


In [72]:
#text prepocessing function.. not sure if/why the stopwords are not working
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string
def preprocess(text):
    #convert all words to lowercase
    text = text.lower()
    tokens = word_tokenize(text)
    #remove digits
    tokens_nonumbers = [token for token in tokens if not token.isdigit()]
    #remove punctuation
    tokens_nopunct = [token for token in tokens if not token in string.punctuation]
    #remove "stop words"
    stwords = stopwords.words('english')
    stwords1 = ['http', 'https']
    stwords = stwords + stwords1
    tokens_nostop = [token for token in tokens if token not in stwords]
    #apply stemming
    stemmer = SnowballStemmer('english')
    tokens_stem = [stemmer.stem(token) for token in tokens]
    new_text = ' '.join(tokens_stem)
    return new_text


In [81]:
#find most common words for all train data
vectorizer = CountVectorizer(preprocessor = preprocess, lowercase = True)
#vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(dftrain.text)
X = vectors.toarray()
pos_df = pd.DataFrame(X, columns = vectorizer.get_feature_names())
pos_dict = {word: pos_df[word].sum() for word in vectorizer.get_feature_names()}
pos_df = pd.DataFrame(pos_dict, index = ['count'])
pos_df = pos_df.T
#pos_df.sort_values(by = 'count', ascending = False).head(15)

In [82]:
#for all train data
pos_df.sort_values(by = 'count', ascending = False).head(15)

Unnamed: 0,count
co,4740
http,4310
the,3275
in,1986
to,1949
of,1831
and,1427
is,969
you,902
for,894


In [75]:
#find most common words for only target = 1 train data
#ok not super helpful with the stop words
vectorizer = CountVectorizer(preprocessor = preprocess, lowercase = True,  ngram_range = (2,2))
vectors_1 = vectorizer.fit_transform(train_1.text)
X = vectors_1.toarray()
pos_df = pd.DataFrame(X, columns = vectorizer.get_feature_names())
pos_dict = {word: pos_df[word].sum() for word in vectorizer.get_feature_names()}
pos_df = pd.DataFrame(pos_dict, index = ['count'])
pos_df = pos_df.T
pos_df.sort_values(by = 'count', ascending = False).head(15)

Unnamed: 0,count
http co,2383
in the,145
https co,133
of the,119
û_ http,109
suicid bomber,60
on the,52
atom bomb,50
mass murder,47
train derail,43


In [102]:
#train data to find most predictive words
lgr = LogisticRegression()
lgr.fit(vectors, train_labels)
df_weights = pd.DataFrame()
df_weights['vocab'] = vectorizer.get_feature_names()
df_weights['coefs'] = lgr.coef_[0]
df_weights.sort_values(by = 'coefs', ascending = False, inplace = True)
print(df_weights.head(15))



           vocab     coefs
7878   hiroshima  2.497942
17866    wildfir  2.399609
5571   earthquak  2.127435
15539      storm  1.982669
16817    typhoon  1.938116
10559    massacr  1.825315
16488    tornado  1.787400
15311      spill  1.757445
6525       flood  1.733247
5398     drought  1.727356
11243     murder  1.685807
6021       evacu  1.650567
7544   hailstorm  1.626117
15657     suicid  1.567204
4942      derail  1.537439


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [106]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

synonyms = []
word = 'earthquake'
for synonym in wordnet.synsets(word):
   for item in synonym.lemmas():
      if word != synonym.name() and len(synonym.lemma_names()) > 1:
        synonyms.append(item.name())

print(synonyms)


['earthquake', 'quake', 'temblor', 'seism']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lwu31\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [132]:
#create dictionary for similar words
#!pip install thefuzz
from thefuzz import fuzz

sim_list = [[sim for sim in vectorizer.get_feature_names() if fuzz.ratio(sim, word)>70] for word in df_weights['vocab'].head(15)]
word_dict = {}
for ind, word in enumerate(df_weights['vocab'].head(15)):
    word_dict[word] = sim_list[ind]
    
print(word_dict)


{'hiroshima': ['hiroshima', 'hiroshima70'], 'wildfir': ['_wildfire__', 'calwildfir', 'dfir', 'idfir', 'wifi', 'wild', 'wildfir', 'wildlif'], 'earthquak': ['earth', 'earthquak', 'earthquake', 'earthquakenew', 'euroquak', 'megaquak'], 'storm': ['abstorm', 'custom', 'duststorm', 'hailstorm', 'histor', 'pastor', 'rainstorm', 'restor', 'sandstorm', 'sector', 'snowstorm', 'sto', 'store', 'storen', 'storey', 'stori', 'storm', 'stormchas', 'stormcom', 'stormy', 'stream', 'tom', 'tor', 'windstorm', 'yycstorm'], 'typhoon': ['typhoon', 'typo'], 'massacr': ['dmassa5', 'mascara', 'mass', 'massacr', 'massacre', 'massag'], 'tornado': ['ronaldo', 'tonysando', 'tora', 'tornado', 'trad', 'turdnado'], 'spill': ['ill', 'philli', 'pill', 'pillow', 'pll', 'silli', 'skill', 'spell', 'spi', 'spill', 'spilt', 'splatl', 'spoil', 'still'], 'flood': ['blood', 'bloodi', 'bloody', 'elwood', 'fleetwood', 'flood', 'floor', 'floored4', 'florid', 'floyd', 'foo', 'food', 'loo', 'ukflood'], 'drought': ['bought', 'brought