In [19]:
import pandas as pd
import nltk
import string
from sklearn.model_selection import cross_val_score

nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to C:\Users\Lianke
[nltk_data]     Qin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Lianke
[nltk_data]     Qin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
stopwords += list(string.punctuation)

In [21]:
df_all_tuples = pd.read_csv('all_tuples.csv')

In [22]:
df_all_tuples = df_all_tuples.fillna(False)

In [23]:
df_all_tuples.head()

Unnamed: 0,document_text,nudity_penetration,sexual_harassment_remarks,sexual_humiliation_extortion_or_sex_work,tasers,trespass,racial_slurs,planting_drugs_guns,neglect_of_duty,refuse_medical_assistance,irrational_aggressive_unstable,searching_arresting_minors
0,SUMMARY REPORT DIGEST- COMPLAINT REGISTER INVE...,False,False,False,False,False,False,False,False,False,False,False
1,COMMAND CHANNEL REVIEW A Sasso Emp 14982 NW 1 ...,False,False,False,False,False,False,False,False,False,False,False
2,SUMMARY REPORT DIGEST- COMPLAINT REGISTER INVE...,False,False,False,True,False,False,True,False,False,False,False
3,SUMMARY REPORT DIGEST- DATE OF REPORTIDAY-MO.-...,False,False,False,False,False,False,False,False,False,False,False
4,SUM MARY REPORT I DATE OF REPORT IDAY-MO COMP...,False,False,False,False,False,False,False,False,False,False,False


In [24]:
label_cols = ['nudity_penetration', 'sexual_harassment_remarks','sexual_humiliation_extortion_or_sex_work', 'tasers', 'trespass','racial_slurs', 'planting_drugs_guns', 'neglect_of_duty','refuse_medical_assistance', 'irrational_aggressive_unstable','searching_arresting_minors']

In [25]:
for label in label_cols:
    print(label + ': ' + str(sum(df_all_tuples[label])))

nudity_penetration: 1
sexual_harassment_remarks: 1
sexual_humiliation_extortion_or_sex_work: 3
tasers: 46
trespass: 26
racial_slurs: 6
planting_drugs_guns: 9
neglect_of_duty: 8
refuse_medical_assistance: 0
irrational_aggressive_unstable: 1
searching_arresting_minors: 1


In [26]:
df_minority_class = df_all_tuples.loc[(df_all_tuples == True).any(axis=1)]

In [27]:
len(df_minority_class), len(df_all_tuples)

(90, 1220)

In [28]:
(1220 - 90) / 90

12.555555555555555

In [29]:
df_oversampled = df_all_tuples.append([df_minority_class]*12,ignore_index=True)

In [30]:
df_oversampled.head()

Unnamed: 0,document_text,nudity_penetration,sexual_harassment_remarks,sexual_humiliation_extortion_or_sex_work,tasers,trespass,racial_slurs,planting_drugs_guns,neglect_of_duty,refuse_medical_assistance,irrational_aggressive_unstable,searching_arresting_minors
0,SUMMARY REPORT DIGEST- COMPLAINT REGISTER INVE...,False,False,False,False,False,False,False,False,False,False,False
1,COMMAND CHANNEL REVIEW A Sasso Emp 14982 NW 1 ...,False,False,False,False,False,False,False,False,False,False,False
2,SUMMARY REPORT DIGEST- COMPLAINT REGISTER INVE...,False,False,False,True,False,False,True,False,False,False,False
3,SUMMARY REPORT DIGEST- DATE OF REPORTIDAY-MO.-...,False,False,False,False,False,False,False,False,False,False,False
4,SUM MARY REPORT I DATE OF REPORT IDAY-MO COMP...,False,False,False,False,False,False,False,False,False,False,False


In [31]:
len(df_oversampled)

2300

In [32]:
for label in label_cols:
    print(label + ': ' + str(sum(df_oversampled[label])))

nudity_penetration: 13
sexual_harassment_remarks: 13
sexual_humiliation_extortion_or_sex_work: 39
tasers: 598
trespass: 338
racial_slurs: 78
planting_drugs_guns: 117
neglect_of_duty: 104
refuse_medical_assistance: 0
irrational_aggressive_unstable: 13
searching_arresting_minors: 13


In [33]:
def normalize(txt):
    well_formed_words = [word.lower() for word in nltk.tokenize.word_tokenize(txt.strip()) if '?' not in word]
    non_stop_words = [word for word in well_formed_words if word not in stopwords]
    return ' '.join(non_stop_words)

In [34]:
df_oversampled['normalized_text'] = df_oversampled['document_text'].apply(lambda x: normalize(x))

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline

In [36]:
train, test = train_test_split(df_oversampled, random_state=42, test_size=0.5, shuffle=True)
X_train = train['normalized_text']
X_test = test['normalized_text']
print(X_train)


1982    summary report qige57complaint register invest...
2278    summary report digest- complaint register inve...
385     member police board city chicago matter recomm...
1322    summary report digest- complaint register inve...
1364    286471 command channel review date initiated c...
                              ...                        
1638    summary report register investigation chicago ...
1095    jill tiilall 1.1 lit-si jiyii lit/lull till-la...
1130    1018691 summary report log 'wa report chicago ...
1294    summary report digestconiplaint register inves...
860     battery report chicago police department instr...
Name: normalized_text, Length: 1150, dtype: object


In [37]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

for label in label_cols:
    print('... Processing {}'.format(label))
    SVC_pipeline.fit(X_train, train[label])
    prediction = SVC_pipeline.predict(X_test)
    print(prediction)
    print(len(prediction))
    print(sum(prediction))
    print('Test F1-score is {}'.format(f1_score(test[label], prediction, 'macro')))

... Processing nudity_penetration
[False False False ... False False False]
1150
8
Test F1-score is 1.0
... Processing sexual_harassment_remarks
[False False False ... False False False]
1150
6
Test F1-score is 1.0
... Processing sexual_humiliation_extortion_or_sex_work
[False False False ... False False False]
1150
16
Test F1-score is 1.0
... Processing tasers
[False False False ... False  True False]
1150
293
Test F1-score is 0.9982905982905983
... Processing trespass
[False  True False ... False False False]
1150
176
Test F1-score is 0.9942857142857142
... Processing racial_slurs
[False False False ... False False False]
1150
39
Test F1-score is 1.0
... Processing planting_drugs_guns
[False False False ... False False False]
1150
62
Test F1-score is 1.0
... Processing neglect_of_duty
[False False False ... False False False]
1150
46
Test F1-score is 1.0
... Processing refuse_medical_assistance


  str(classes[c]))


[False False False ... False False False]
1150
0
Test F1-score is 0.0
... Processing irrational_aggressive_unstable


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


[False False False ... False False False]
1150
8
Test F1-score is 1.0
... Processing searching_arresting_minors
[False False False ... False False False]
1150
8
Test F1-score is 1.0


In [38]:
train, test = train_test_split(df_oversampled, random_state=42, test_size=0.5, shuffle=True)
X_train = train['normalized_text']
X_test = test['normalized_text']
X = df_oversampled['normalized_text']

#key is label, value is corresponding SVM prediction model
SVC_map = {}

for label in label_cols:
    SVC_pipeline = Pipeline([('tfidf', TfidfVectorizer()),('clf', OneVsRestClassifier(LinearSVC(penalty='l2'), n_jobs=8)),])
    SVC_pipeline.fit(X_train, train[label])
    scores = cross_val_score(SVC_pipeline, X, df_oversampled[label], cv=10)
    print(scores)
    SVC_map[label] = SVC_pipeline

[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[0.97830803 1.         1.         0.98910675 1.        ]
[0.97613883 1.         1.         1.         1.        ]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]


In [40]:
output = pd.DataFrame()
for label in label_cols:
    output[label] = SVC_map[label].predict(X_test)
# output['ID'] = test_table['ID']
print(output.head())

   nudity_penetration  sexual_harassment_remarks  \
0               False                      False   
1               False                      False   
2               False                      False   
3               False                      False   
4               False                      False   

   sexual_humiliation_extortion_or_sex_work  tasers  trespass  racial_slurs  \
0                                     False   False     False         False   
1                                     False   False      True         False   
2                                     False   False     False         False   
3                                     False   False     False         False   
4                                     False   False     False         False   

   planting_drugs_guns  neglect_of_duty  refuse_medical_assistance  \
0                False            False                      False   
1                False            False                      False   
2 

In [None]:
prediction = SVC_pipeline.predict(X_test)
print(prediction)