# Scrape data

In [133]:
# General
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# For Natural Language Processing
import regex as re
import unidecode
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Watertown transcript

In [134]:
# import text file, Watertown transcript
with open('watertown.txt', 'r') as file:
    data = file.read()

In [135]:
# Split the text by sentence
df = data.split('\n')
print(df[0])
print(df[2])

One 5'7", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV.  Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7", second one with



In [136]:
type(df[2])

str

In [137]:
len(df[2])

0

In [138]:
# Remove empty sentences
df = [string for string in df if len(string) > 0]

# Put it into the dataframe
df_transcript = pd.DataFrame({'transcript':df})

# Define Watertown text as 'is_crime'
df_transcript['type'] = 'is_crime'

In [139]:
df_transcript

Unnamed: 0,transcript,type
0,"One 5'7"", the second with darker skin, both su...",is_crime
1,"darker skin, no description on clothing yet, a...",is_crime
2,"508, inaudible...supposedly he is saying inaud...",is_crime
3,"Control: they went into that gas station, pa...",is_crime
4,"Yea, I believe they have video in that Shell s...",is_crime
...,...,...
163,"Control, the suspect vehicle is on Spruce - th...",is_crime
164,"(officer at Spruce ) 19, ahh you have a math p...",is_crime
165,"19, I don't know - I gotten word that there's ...",is_crime
166,"521, All non Cambridge units we have two suspe...",is_crime


## Football Commentary

In [140]:
# https://www.news18.com/fifa-world-cup-2018/commentary/21977/
# http://www.chesterfc.com/page/3/?s=LIVE+TEXT+COMMENTARY

In [141]:
# import text file, soccer commentary transcript
with open('football.txt', 'r') as file:
    data2 = file.read()

In [142]:
# Split the text by sentence
df_nonwatertown = data2.split('\n')

# Remove empty sentences
df_nonwatertown = [string for string in df_nonwatertown if len(string) > 0]

# Put it into the dataframe
df_non_transcript = pd.DataFrame({'transcript':df_nonwatertown})

# Define Watertown text as 'is_not_crime'
df_non_transcript['type'] = 'is_not_crime'

In [143]:
df_non_transcript

Unnamed: 0,transcript,type
0,we played nearly six minutes yer list’ning to ...,is_not_crime
1,ROBBIE SAVAGE: (2) Yeh (.) I thought united (....,is_not_crime
2,ALAN GREEN: / Sergio,is_not_crime
3,ROBBIE SAVAGE: /yep/,is_not_crime
4,ALAN GREEN: /Ramos who got the header in,is_not_crime
...,...,...
161,"Portugal Substitutions: Beto, Ruben Dias, Manu...",is_not_crime
162,"Portugal (4-2-3-1): Rui Patricio; Cedric, Pepe...",is_not_crime
163,Spain look to repeat their 2010 World Cup perf...,is_not_crime
164,The 2016 UEFA European Champions will look to ...,is_not_crime


In [144]:
# Combine into single dataframe
df = pd.concat([df_transcript, df_non_transcript])
df = df.reset_index(drop=True)
df['is_crime'] = df['type'].map(lambda t: 1 if t == 'is_crime' else 0)

In [145]:
df.head()

Unnamed: 0,transcript,type,is_crime
0,"One 5'7"", the second with darker skin, both su...",is_crime,1
1,"darker skin, no description on clothing yet, a...",is_crime,1
2,"508, inaudible...supposedly he is saying inaud...",is_crime,1
3,"Control: they went into that gas station, pa...",is_crime,1
4,"Yea, I believe they have video in that Shell s...",is_crime,1


# Cleaning

## Pre-processing

In [146]:
# Drop duplicates and check the base accuracy
df.drop_duplicates(inplace=True)
df['is_crime'].value_counts(normalize=True)

0    0.50303
1    0.49697
Name: is_crime, dtype: float64

In [147]:
# Preprocessing function
def commentaries_to_words(raw_commentary):
    
    # Get rid of accents
    unaccented = unidecode.unidecode(raw_commentary)
    
    # Get rid of punctuation
    letters_only = re.sub("[^a-zA-Z]", " ", unaccented)
    
    # Get all lowercase words
    words = letters_only.lower().split()
    
    # Instantiate and run Lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens_lem = [lemmatizer.lemmatize(i) for i in words]
    
    # Remove stop words
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if not w in stops]
    
    # Join into string and return the result.
    return(" ".join(meaningful_words))

In [148]:
# Clean all commentary 
total_commentaries = df.shape[0]
clean_commentaries = []

print("Cleaning the commentaries")

i = 0
for commentary in df['transcript']:
    clean_commentaries.append(commentaries_to_words(commentary))
    
    if (i+1) % 100 == 0:
        print(f'Commentary {i+1} of {total_commentaries}.')
        
        i += 1
        
        if i == total_commentaries:
            print('Done.')

Cleaning the commentaries


In [149]:
df = df.assign(clean_commentary = clean_commentaries)
df.head()

Unnamed: 0,transcript,type,is_crime,clean_commentary
0,"One 5'7"", the second with darker skin, both su...",is_crime,1,one second darker skin suspects armed firearms...
1,"darker skin, no description on clothing yet, a...",is_crime,1,darker skin description clothing yet ahh armed...
2,"508, inaudible...supposedly he is saying inaud...",is_crime,1,inaudible supposedly saying inaudible fled tow...
3,"Control: they went into that gas station, pa...",is_crime,1,control went gas station paid cash gas fled to...
4,"Yea, I believe they have video in that Shell s...",is_crime,1,yea believe video shell station victim believe...


In [150]:
df_clean = pd.DataFrame({'commentary': df['clean_commentary'], 'crime': df['is_crime']})
df_clean.head()

Unnamed: 0,commentary,crime
0,one second darker skin suspects armed firearms...,1
1,darker skin description clothing yet ahh armed...,1
2,inaudible supposedly saying inaudible fled tow...,1
3,control went gas station paid cash gas fled to...,1
4,yea believe video shell station victim believe...,1


In [151]:
df_clean.to_csv('cleaned_data.csv', index=False)

In [152]:
df_clean['commentary'].isna().sum()

0

# NLP

## Classification Models

In [153]:
# import libraries

# For classification modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# For evaluation
from sklearn.metrics import confusion_matrix

In [154]:
# import data
df = pd.read_csv('cleaned_data.csv')

In [155]:
# Few examples of crime
df.loc[df['crime']==1, 'commentary'].head()

0    one second darker skin suspects armed firearms...
1    darker skin description clothing yet ahh armed...
2    inaudible supposedly saying inaudible fled tow...
3    control went gas station paid cash gas fled to...
4    yea believe video shell station victim believe...
Name: commentary, dtype: object

In [156]:
# Few examples of not_crime
df.loc[df['crime']==0, 'commentary'].head()

164    played nearly six minutes yer list ning five l...
165    robbie savage yeh thought united know back fou...
166                                    alan green sergio
167                                    robbie savage yep
168                          alan green ramos got header
Name: commentary, dtype: object

In [157]:
df = df.dropna()

### Get baseline accuracy score

In [158]:
df['crime'].value_counts(normalize=True)

0    0.506098
1    0.493902
Name: crime, dtype: float64

### Train/Test Split

In [159]:
# Set features and target
features = df['commentary']

X = features
y = df['crime']

In [160]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Bag of Words

In [191]:
# Instantiate CountVectorizer and add some stop words
cv = CountVectorizer(stop_words=['inaudible','okay','portugal','spain'])

# Fit the vectorizer
X_train_cv = cv.fit_transform(X_train)
X_train_cv = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

# Transform the test set
X_test_cv = cv.transform(X_test)

### Logistic Regression

In [177]:
# Instantiate model
lr = LogisticRegression(solver='lbfgs')

# Fit model
lr.fit(X_train_cv, y_train)

# Get scores
print('CV score:', cross_val_score(lr, X_train_cv, y_train, cv=3).mean())
print('Training accuracy:', lr.score(X_train_cv, y_train))
print('Testing accuracy:', lr.score(X_test_cv, y_test))

CV score: 0.9512195121951219
Training accuracy: 1.0
Testing accuracy: 0.975609756097561


### Naive Bayes

In [178]:
# Instantiate model
nb = MultinomialNB(0.7)

# Fit model
nb.fit(X_train_cv, y_train)

# Get scores
print('CV score:', cross_val_score(nb, X_train_cv, y_train, cv=3).mean())
print('Training accuracy:', nb.score(X_train_cv, y_train))
print('Testing accuracy:', nb.score(X_test_cv, y_test))

CV score: 0.9430894308943089
Training accuracy: 0.9959349593495935
Testing accuracy: 0.9512195121951219


### Random Forest

In [179]:
# Instantiate model
rf = RandomForestClassifier()

In [180]:
rf.fit(X_train_cv, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [181]:
cross_val_score(rf,X_train_cv,y_train).mean()

0.8212244897959184

In [182]:
# Set model params 
rf_params = {
    'n_estimators': [15, 20, 25],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'min_samples_split': [2,3,4]
}

rf_gs = GridSearchCV(rf, param_grid=rf_params)
rf_gs.fit(X_train_cv, y_train)
print(rf_gs.best_score_)
rf_gs.best_params_

0.8785306122448979


{'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 25}

In [183]:
# Store the best fit model as best_rf 
best_rf = rf_gs.best_estimator_

# Evaluate the best fit model on the train and test data
print('train score:', best_rf.score(X_train_cv, y_train))
print('test score:', best_rf.score(X_test_cv, y_test))

train score: 0.926829268292683
test score: 0.8902439024390244


## Evaluation

In [184]:
from sklearn.metrics import confusion_matrix

In [185]:
# Use best scoring model to evaluate
predictions = lr.predict(X_test_cv)
cm = confusion_matrix(y_test, predictions)

In [186]:
# Convert confusion matrix to dataframe
cm_df = pd.DataFrame(cm,
                    columns = ['predicted neg', 'predicted pos'],
                    index = ['actual neg', 'actual pos'])
cm_df

Unnamed: 0,predicted neg,predicted pos
actual neg,41,1
actual pos,1,39


In [187]:
# Create dataframe to show true and predicted values
results = pd.DataFrame()
results['actual'] = y_test
results['predicted'] = predictions

# Create column to show confusion matrix values
results['confusion'] = ''

# Set true positives
results['confusion'] = np.where(((results['actual']==1) & (results['predicted']==1)), 'TP', results['confusion'])

# Set true negatives
results['confusion'] = np.where(((results['actual']==0) & (results['predicted']==0)), 'TN', results['confusion'])

# Set false positives
results['confusion'] = np.where(((results['actual']==0) & (results['predicted']==1)), 'FP', results['confusion'])

# Set false negatives
results['confusion'] = np.where(((results['actual']==1) & (results['predicted']==0)), 'FN', results['confusion'])

results.head(3)

Unnamed: 0,actual,predicted,confusion
109,1,1,TP
93,1,1,TP
52,1,1,TP


In [188]:
coefs = list(zip(cv.get_feature_names(), lr.coef_[0].T))
coefs = pd.DataFrame(coefs, columns = ['word','coef'])
coefs['e^coef'] = np.exp(coefs['coef'])

In [189]:
# Show words most associated with is_Crime
coefs.sort_values(by='e^coef', ascending=False).head(10)

Unnamed: 0,word,coef,e^coef
865,spruce,0.971849,2.642827
993,vehicle,0.880107,2.411157
652,officer,0.771334,2.162649
249,dexter,0.746303,2.109188
1014,watertown,0.731561,2.078322
621,mt,0.682675,1.979165
72,auburn,0.682675,1.979165
198,copy,0.58922,1.802582
520,laurel,0.576691,1.780138
189,confirm,0.561408,1.753138


In [190]:
coefs.sort_values(by='e^coef').head(10)

Unnamed: 0,word,coef,e^coef
84,ball,-0.844388,0.42982
387,get,-0.842814,0.430498
408,good,-0.835097,0.433833
946,time,-0.785553,0.455867
839,silva,-0.777572,0.45952
948,today,-0.697451,0.497852
127,bruno,-0.683084,0.505057
330,fernandes,-0.683084,0.505057
789,ronaldo,-0.680434,0.506397
212,cristiano,-0.680344,0.506443
