In [82]:
import pyLDAvis
import json
import numpy as np
import pandas as pd
from nltk.stem import snowball, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from nltk.corpus import stopwords
from nltk import FreqDist
from itertools import compress

In [2]:
file_name = "Isla Vista - All Excerpts - 1_2_2019.xlsx"
data = pd.read_excel(file_name, sheet_name='Dedoose Excerpts Export')
print(data.shape)
data = data.dropna(axis=0)
print(data.shape)
print(data.columns)

(8131, 53)
(8127, 53)
Index(['StoryID', 'Excerpt', 'CodesApplied_Combined', 'ACCOUNT',
       'ACCOUNT_Cultural', 'ACCOUNT_Individual', 'ACCOUNT_Other',
       'COMMUNITYRECOVERY', 'EVENT', 'GRIEF', 'GRIEF_Individual',
       'GRIEF_Community', 'GRIEF_Societal', 'HERO', 'INVESTIGATION', 'JOURNEY',
       'JOURNEY_Mental', 'JOURNEY_Physical', 'LEGAL', 'MEDIA', 'MISCELLANEOUS',
       'MOURNING', 'MOURNING_Individual', 'MOURNING_Community',
       'MOURNING_Societal', 'PERPETRATOR', 'PHOTO', 'POLICY', 'POLICY_Guns',
       'POLICY_InfoSharing', 'POLICY_MentalHealth', 'POLICY_Other',
       'POLICY_VictimAdv', 'POLICY_OtherAdv', 'POLICY_Practice',
       'PRIVATESECTOR', 'RACECULTURE', 'RESOURCES', 'SAFETY',
       'SAFETY_Community', 'SAFETY_Individual', 'SAFETY_SchoolOrg',
       'SAFETY_Societal', 'SOCIALSUPPORT', 'THREAT', 'THREAT_Assessment',
       'TRAUMA', 'TRAUMA_Physical', 'TRAUMA_Psychological',
       'TRAUMA_Individual', 'TRAUMA_Community', 'TRAUMA_Societal', 'VICTIMS'],
    

In [28]:
excerpts = list(data['Excerpt'])
def stem_tokenizer(doc):
    tokens = word_tokenize(doc) 
    stemmer = snowball.SnowballStemmer("english", ignore_stopwords=True)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    list_tokens = [tok.lower() for tok in stemmed_tokens if tok.isalpha()]
    return(' '.join(list_tokens))
stem_tokenizer(excerpts[3])

'a student last friday kill six peopl and wound more in isla vista before turn his gun on himself comment blame the crime on everyth from misogynist artist to easi access to gun and divorc even has come under scrutini is american cultur to blame for mass murder'

In [7]:
docs = [stem_tokenizer(doc) for doc in excerpts]
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(docs).toarray() 

In [10]:
docs_train, docs_test, y_train, y_test = train_test_split(X, list(data['ACCOUNT']), test_size=0.2, random_state=0) 

In [13]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(docs_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
y_pred = classifier.predict(docs_test)  

In [15]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[1220   26]
 [  86  294]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1246
           1       0.92      0.77      0.84       380

    accuracy                           0.93      1626
   macro avg       0.93      0.88      0.90      1626
weighted avg       0.93      0.93      0.93      1626

0.931119311193112


In [51]:
print(len(classifier.feature_importances_) == len(vectorizer.get_feature_names()))
top_feats = np.argsort(classifier.feature_importances_)[-10:]

In [53]:
feat_names = [vectorizer.get_feature_names()[feat] for feat in top_feats]
print(feat_names)

['student', 'culture', 'gun', 'april', 'sex', 'blamed', 'manifesto', 'rodger', 'mental', 'video']


In [71]:
test_doc = np.zeros(len(docs_test[1]))
test_doc[top_feats[0]] = 1
print("class: "+str(classifier.predict([test_doc])))

class: [0]


## with svm

In [36]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(docs_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [37]:
y_pred = clf.predict(docs_test)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[1173   73]
 [  73  307]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1246
           1       0.81      0.81      0.81       380

    accuracy                           0.91      1626
   macro avg       0.87      0.87      0.87      1626
weighted avg       0.91      0.91      0.91      1626

0.9102091020910209


## with logistic regression

In [84]:
logreg = LogisticRegression()
logreg.fit(docs_train, y_train)

y_pred = logreg.predict(docs_test)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))



[[1188   58]
 [  75  305]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1246
           1       0.84      0.80      0.82       380

    accuracy                           0.92      1626
   macro avg       0.89      0.88      0.88      1626
weighted avg       0.92      0.92      0.92      1626

0.9182041820418204


## with tfidf

In [30]:
docs = [stem_tokenizer(doc) for doc in excerpts]
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
 X = tfidfconverter.fit_transform(docs).toarray() 

In [31]:
docs_train, docs_test, y_train, y_test = train_test_split(X, list(data['ACCOUNT']), test_size=0.2, random_state=0) 
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(docs_train, y_train) 
y_pred = classifier.predict(docs_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[1224   22]
 [  87  293]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1246
           1       0.93      0.77      0.84       380

    accuracy                           0.93      1626
   macro avg       0.93      0.88      0.90      1626
weighted avg       0.93      0.93      0.93      1626

0.9329643296432965


## with lemma

In [32]:
excerpts = list(data['Excerpt'])
def lem_tokenizer(doc):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(doc) 
    lemmer = WordNetLemmatizer()
    lemmed_tokens = [lemmer.lemmatize(word) for word in tokens if word.lower() not in stop_words]
    list_tokens = [tok.lower() for tok in lemmed_tokens if tok.isalpha()]
    return(' '.join(list_tokens))
lem_tokenizer(excerpts[3])

'student last friday killed six people wounded isla vista turning gun commenters blamed crime everything misogynistic artist easy access gun divorce even come scrutiny american culture blame mass murder'

In [33]:
docs = [lem_tokenizer(doc) for doc in excerpts]
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(docs).toarray() 

In [34]:
docs_train, docs_test, y_train, y_test = train_test_split(X, list(data['ACCOUNT']), test_size=0.2, random_state=0) 
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(docs_train, y_train) 
y_pred = classifier.predict(docs_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[1218   28]
 [  86  294]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1246
           1       0.91      0.77      0.84       380

    accuracy                           0.93      1626
   macro avg       0.92      0.88      0.90      1626
weighted avg       0.93      0.93      0.93      1626

0.9298892988929889
