# Load the items-Copy1.csv file into a pandas dataframe and 
# replace all missing (NaN) fields with empty strings

In [203]:
import pandas as pd
import numpy as np

# read dataset into pandas dataframe
df_items = pd.read_csv('items-Copy1.csv')

# Replace missing values (NaN) with empty strings so that later string
# operations on the columns (e.g. row['tag'].lower()) never hit a float NaN
df_items = df_items.fillna('')

In [204]:
# Preview the 'description' column; this text is what the labelling cells
# below copy into their per-task dataframes
df_items['description']

0       Black American racial experience is real. We s...
1       The best costume for Halloween worth posting. ...
2       Keep on whining and crying for your president,...
3       End the whining and crying, end the riots with...
4       Black girls are the definition of national gre...
                              ...                        
3007                      New ideas, old values. Like us!
3008    Secured borders are a national priority. We ne...
3009    Secured borders should be a top priority. We n...
3010                                Bernie for president!
3011    Secured borders are a national priority. Ameri...
Name: description, Length: 3012, dtype: object

# Go through all rows in the dataframe and check whether
1. anger but not fear is in the tags
2. fear but not anger is in the tags
3. anger and fear are both in the tags
4. neither anger nor fear is in the tags

In [205]:
labels_list_word = []       # label of each row as a word
labels_list_numerical = []  # label of each row as a number
descriptions_list = []      # description text of each row

# (anger in tag, fear in tag) -> (word label, numerical label)
label_lookup = {
    (True, False): ('anger', 0),
    (False, True): ('fear', 1),
    (True, True): ('both', 2),
    (False, False): ('neither', 3),
}

for tag_text, description in zip(df_items['tag'], df_items['description']):
    tag_text = tag_text.lower()  # the substring test is case-insensitive
    word, number = label_lookup['anger' in tag_text, 'fear' in tag_text]
    labels_list_word.append(word)
    descriptions_list.append(description)
    labels_list_numerical.append(number)

# Assemble the four-class dataframe, one column per list
df = pd.DataFrame({
    'label word': labels_list_word,
    'label numerical': labels_list_numerical,
    'description': descriptions_list,
})

# Show every row that is not labelled as anger-only
df[df['label numerical'] != 0]

Unnamed: 0,label word,label numerical,description
0,fear,1,Black American racial experience is real. We s...
4,neither,3,Black girls are the definition of national gre...
5,neither,3,"Imma stay here comfy and untouched, yet workin..."
6,neither,3,There is a disgusting video Circulating on the...
10,neither,3,Unapologetically melaneted Kings and Queens ar...
...,...,...,...
3007,neither,3,"New ideas, old values. Like us!"
3008,neither,3,Secured borders are a national priority. We ne...
3009,neither,3,Secured borders should be a top priority. We n...
3010,neither,3,Bernie for president!


# Go through all rows in the dataframe and check whether
1. anger is in the tags
2. anger is not in the tags

## This is a binary classification task

In [206]:
labels_list_word = []       # label of each row as a word
labels_list_numerical = []  # label of each row as a number
descriptions_list = []      # description text of each row

# Binary task: 1 / 'anger' when the lower-cased tag mentions anger, else 0 / 'none'
for tag_text, description in zip(df_items['tag'], df_items['description']):
    is_anger = 'anger' in tag_text.lower()
    labels_list_word.append('anger' if is_anger else 'none')
    descriptions_list.append(description)
    labels_list_numerical.append(1 if is_anger else 0)

# Assemble the anger dataframe, one column per list
df_anger = pd.DataFrame({
    'label word': labels_list_word,
    'label numerical': labels_list_numerical,
    'description': descriptions_list,
})

# Show only the rows labelled as anger
df_anger[df_anger['label word'] == 'anger']

Unnamed: 0,label word,label numerical,description
1,anger,1,The best costume for Halloween worth posting. ...
2,anger,1,"Keep on whining and crying for your president,..."
3,anger,1,"End the whining and crying, end the riots with..."
7,anger,1,Art imitates life art. This photo is great.
8,anger,1,"In America, racial oppression and racism were ..."
...,...,...,...
2964,anger,1,Officials of the Highlands High School are inv...
2976,anger,1,"Protect the 2nd. Without it, you won't have an..."
2989,anger,1,"Protect the 2nd. Without it, you won't have an..."
2995,anger,1,"Remember folks, dance and music is a large par..."


# Go through all rows in the dataframe and check whether
1. fear is in the tags
2. fear is not in the tags

## This is a binary classification task

In [207]:
labels_list_word = []       # label of each row as a word
labels_list_numerical = []  # label of each row as a number
descriptions_list = []      # description text of each row

# Binary task: 1 / 'fear' when the lower-cased tag mentions fear, else 0 / 'none'
for tag_text, description in zip(df_items['tag'], df_items['description']):
    is_fear = 'fear' in tag_text.lower()
    labels_list_word.append('fear' if is_fear else 'none')
    descriptions_list.append(description)
    labels_list_numerical.append(1 if is_fear else 0)

# Assemble the fear dataframe, one column per list
df_fear = pd.DataFrame({
    'label word': labels_list_word,
    'label numerical': labels_list_numerical,
    'description': descriptions_list,
})

# Show only the rows labelled as fear
df_fear[df_fear['label word'] == 'fear']

Unnamed: 0,label word,label numerical,description
0,fear,1,Black American racial experience is real. We s...
11,fear,1,Watch this heart-piercing story about a racial...
15,fear,1,People are genuinely scared for their futures!...
23,fear,1,"For years, white supremacists in the Dothan, A..."
25,fear,1,The cop beat this man like he was a runaway sl...
...,...,...,...
2973,fear,1,Give your online shopping a fresh start with t...
2976,fear,1,"Protect the 2nd. Without it, you won't have an..."
2987,fear,1,People really need to understand
2989,fear,1,"Protect the 2nd. Without it, you won't have an..."


# Go through all rows in the dataframe and check whether
1. anger and fear are both in the tags
2. anything else (at most one of anger and fear is in the tags)

## This is a binary classification task

In [208]:
labels_list_word = []       # label of each row as a word
labels_list_numerical = []  # label of each row as a number
descriptions_list = []      # description text of each row

# Binary task: 1 / 'both' when the lower-cased tag mentions fear AND anger,
# 0 / 'none' for every other row (including rows with only one of the two)
for tag_text, description in zip(df_items['tag'], df_items['description']):
    tag_text = tag_text.lower()
    has_both = 'fear' in tag_text and 'anger' in tag_text
    labels_list_word.append('both' if has_both else 'none')
    descriptions_list.append(description)
    labels_list_numerical.append(1 if has_both else 0)

# Assemble the both-emotions dataframe, one column per list
df_both = pd.DataFrame({
    'label word': labels_list_word,
    'label numerical': labels_list_numerical,
    'description': descriptions_list,
})

# Show only the rows labelled as both
df_both[df_both['label word'] == 'both']

Unnamed: 0,label word,label numerical,description
15,both,1,People are genuinely scared for their futures!...
23,both,1,"For years, white supremacists in the Dothan, A..."
25,both,1,The cop beat this man like he was a runaway sl...
40,both,1,There is a disgusting video Circulating on the...
41,both,1,Black American racial experience is real. We s...
...,...,...,...
2930,both,1,Black Matters. Black community.
2954,both,1,Fast-growing black community. Latest news and ...
2964,both,1,Officials of the Highlands High School are inv...
2976,both,1,"Protect the 2nd. Without it, you won't have an..."


# Create train test split for classification for anger or not anger

In [209]:
from sklearn.model_selection import train_test_split

# Hold out part of the anger data for evaluation; the fixed random_state
# makes the split reproducible from run to run
anger_split = train_test_split(
    df_anger['description'],
    df_anger['label numerical'],
    random_state=1,
)
X_train_anger, X_test_anger, Y_train_anger, Y_test_anger = anger_split

# Report how many rows ended up in each part
print('Number of rows in the total set for anger: {}'.format(len(df_anger)))
print('Number of rows in the training set for anger: {}'.format(len(X_train_anger)))
print('Number of rows in the test set for anger: {}'.format(len(X_test_anger)))

Number of rows in the total set for anger: 3012
Number of rows in the training set for anger: 2259
Number of rows in the test set for anger: 753


# Create an instance of CountVectorizer
# Fit training data and return matrix
# transform testing data and return matrix

In [210]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer with its default settings
count_vector_anger = CountVectorizer()

# Learn the vocabulary from the training descriptions only and encode them
# as a document-term matrix
training_data_anger = count_vector_anger.fit_transform(X_train_anger)

# Encode the test descriptions with the vocabulary learned above
# (transform only, the vectorizer is not refitted on test data)
testing_data_anger = count_vector_anger.transform(X_test_anger)

# Utilize MultinomialNB from sklearn to create a naive bayes classifier and form predictions

In [211]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training document-term matrix
naive_bayes_anger = MultinomialNB()
naive_bayes_anger.fit(training_data_anger, Y_train_anger)

# Predict a label for every document in the held-out test matrix
predictions_anger = naive_bayes_anger.predict(testing_data_anger)

In [None]:
# Pin matplotlib to 3.1.0 because other versions cause problems with seaborn.
# %pip (rather than !pip3) installs into the environment of the running kernel.
%pip install matplotlib==3.1.0



# Print out classification report for anger vs not anger

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn

# Class names ordered by numeric label (0 = 'none', 1 = 'anger').
# scikit-learn lists report rows and confusion-matrix rows/columns in sorted
# label order, so the names have to follow that order.
labels = ['none', 'anger']

print('Classification report for anger classification: ')
print('---------------------------------------------------------- ')
print(classification_report(Y_test_anger, predictions_anger, target_names=labels))
print('---------------------------------------------------------- ')
# Metric functions take (y_true, y_pred); passing them the other way round
# swaps precision and recall.
print('Accuracy score: ', format(accuracy_score(Y_test_anger, predictions_anger)))
print('Precision score: ', format(precision_score(Y_test_anger, predictions_anger)))
print('Recall score: ', format(recall_score(Y_test_anger, predictions_anger)))
print('F1 score: ', format(f1_score(Y_test_anger, predictions_anger)))
print('---------------------------------------------------------- ')

cm = confusion_matrix(list(Y_test_anger), predictions_anger)
print("Confusion matrix anger: \n")
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion Matrix Anger')
fig.colorbar(cax)
# Pin exactly one tick per class so each label sits on its own row/column,
# independent of the tick locations matplotlib would pick by default
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_anger.png')
plt.show()

# Create train test split for classification for fear or not fear

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the fear data for evaluation; the fixed random_state
# makes the split reproducible from run to run
fear_split = train_test_split(
    df_fear['description'],
    df_fear['label numerical'],
    random_state=1,
)
X_train_fear, X_test_fear, Y_train_fear, Y_test_fear = fear_split

# Report how many rows ended up in each part
print('Number of rows in the total set for fear: {}'.format(len(df_fear)))
print('Number of rows in the training set for fear: {}'.format(len(X_train_fear)))
print('Number of rows in the test set for fear: {}'.format(len(X_test_fear)))

# Create an instance of CountVectorizer
# Fit training data and return matrix
# transform testing data and return matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer with its default settings
count_vector_fear = CountVectorizer()

# Learn the vocabulary from the training descriptions only and encode them
# as a document-term matrix
training_data_fear = count_vector_fear.fit_transform(X_train_fear)

# Encode the test descriptions with the vocabulary learned above
# (transform only, the vectorizer is not refitted on test data)
testing_data_fear = count_vector_fear.transform(X_test_fear)

# Utilize MultinomialNB from sklearn to create a naive bayes classifier and form predictions

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training document-term matrix
naive_bayes_fear = MultinomialNB()
naive_bayes_fear.fit(training_data_fear, Y_train_fear)

# Predict a label for every document in the held-out test matrix
predictions_fear = naive_bayes_fear.predict(testing_data_fear)

# Print out classification report for fear vs not fear

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn

# Class names ordered by numeric label (0 = 'none', 1 = 'fear').
# scikit-learn lists report rows and confusion-matrix rows/columns in sorted
# label order, so the names have to follow that order.
labels = ['none', 'fear']

print('Classification report for fear classification: ')
print('---------------------------------------------------------- ')
print(classification_report(Y_test_fear, predictions_fear, target_names=labels))
print('---------------------------------------------------------- ')
# Metric functions take (y_true, y_pred); passing them the other way round
# swaps precision and recall.
print('Accuracy score: ', format(accuracy_score(Y_test_fear, predictions_fear)))
print('Precision score: ', format(precision_score(Y_test_fear, predictions_fear)))
print('Recall score: ', format(recall_score(Y_test_fear, predictions_fear)))
print('F1 score: ', format(f1_score(Y_test_fear, predictions_fear)))
print('---------------------------------------------------------- ')

cm = confusion_matrix(list(Y_test_fear), predictions_fear)
print("Confusion matrix fear: \n")
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion Matrix Fear')
fig.colorbar(cax)
# Pin exactly one tick per class so each label sits on its own row/column,
# independent of the tick locations matplotlib would pick by default
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_fear.png')
plt.show()

# Create train test split for classification for both or not both

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the both-emotions data for evaluation; the fixed
# random_state makes the split reproducible from run to run
both_split = train_test_split(
    df_both['description'],
    df_both['label numerical'],
    random_state=1,
)
X_train_both, X_test_both, Y_train_both, Y_test_both = both_split

# Report how many rows ended up in each part
print('Number of rows in the total set for both: {}'.format(len(df_both)))
print('Number of rows in the training set for both: {}'.format(len(X_train_both)))
print('Number of rows in the test set for both: {}'.format(len(X_test_both)))

# Create an instance of CountVectorizer
# Fit training data and return matrix
# transform testing data and return matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer with its default settings
count_vector_both = CountVectorizer()

# Learn the vocabulary from the training descriptions only and encode them
# as a document-term matrix
training_data_both = count_vector_both.fit_transform(X_train_both)

# Encode the test descriptions with the vocabulary learned above
# (transform only, the vectorizer is not refitted on test data)
testing_data_both = count_vector_both.transform(X_test_both)

# Utilize MultinomialNB from sklearn to create a naive bayes classifier and form predictions

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training document-term matrix
naive_bayes_both = MultinomialNB()
naive_bayes_both.fit(training_data_both, Y_train_both)

# Predict a label for every document in the held-out test matrix
predictions_both = naive_bayes_both.predict(testing_data_both)

# Print out classification report for both vs not both

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn

# Class names ordered by numeric label (0 = 'none', 1 = 'both').
# scikit-learn lists report rows and confusion-matrix rows/columns in sorted
# label order, so the names have to follow that order.
labels = ['none', 'both']

print('Classification report for both classification: ')
print('---------------------------------------------------------- ')
print(classification_report(Y_test_both, predictions_both, target_names=labels))
print('---------------------------------------------------------- ')
# Metric functions take (y_true, y_pred); passing them the other way round
# swaps precision and recall.
print('Accuracy score: ', format(accuracy_score(Y_test_both, predictions_both)))
print('Precision score: ', format(precision_score(Y_test_both, predictions_both)))
print('Recall score: ', format(recall_score(Y_test_both, predictions_both)))
print('F1 score: ', format(f1_score(Y_test_both, predictions_both)))
print('---------------------------------------------------------- ')

cm = confusion_matrix(list(Y_test_both), predictions_both)
print("Confusion matrix both: \n")
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion Matrix Both')
fig.colorbar(cax)
# Pin exactly one tick per class so each label sits on its own row/column,
# independent of the tick locations matplotlib would pick by default
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_both.png')
plt.show()

# Sensitivity measures the proportion of actual positives that are correctly identified as such. In probability notation: P(T+|D+) = TP / (TP+FN).

# Specificity measures the proportion of actual negatives that are correctly identified as such. In probability notation: P(T-|D-) = TN / (TN + FP).

In [None]:
def perf_measure(y_actual, y_hat):
    '''
    Description:
        Compares ground truth and predicted binary labels position by
        position and counts the True Positives (TP), False Positives (FP),
        True Negatives (TN) and False Negatives (FN).
        Only predictions equal to 1 or 0 are counted; any other predicted
        value contributes to none of the four counters.
    Input:
        y_actual: Actual values of y set. Any iterable (list, numpy array,
                  pandas Series, generator, ...).
        y_hat: Predicted values of y set. Any iterable. Its length decides
               how many positions are compared.
    Output:
        (TP, FP, TN, FN): Tuple of performance measures
    Raises:
        IndexError: if y_actual has fewer elements than y_hat.
    '''
    # Materialise both inputs so the comparison is purely positional. Indexing
    # the raw arguments would do a label lookup on a pandas Series (wrong or
    # failing after a shuffled split) and len() is unavailable on iterators.
    actual_values = list(y_actual)
    predicted_values = list(y_hat)

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    # Go through all predicted values
    for position, predicted in enumerate(predicted_values):
        actual = actual_values[position]
        if predicted == 1:
            if actual == 1:  # True Positive
                TP += 1
            else:  # False Positive
                FP += 1
        elif predicted == 0:
            if actual == 0:  # True Negative
                TN += 1
            else:  # False Negative
                FN += 1

    return (TP, FP, TN, FN)

# Sensitivity and Specificity measures for anger, fear and both classifications before any adjustments for imbalance

In [None]:
# Confusion counts of the anger classifier on its test split
anger_counts = perf_measure(list(Y_test_anger), list(predictions_anger))
TP_anger, FP_anger, TN_anger, FN_anger = anger_counts

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
sensitivity_anger = TP_anger / (FN_anger + TP_anger)
specificity_anger = TN_anger / (FP_anger + TN_anger)

print("Sensitivity Measure for Anger Classification: {sensitivity_anger}".format(sensitivity_anger=sensitivity_anger))
print("Specificity Measure for Anger Classification: {specificity_anger}".format(specificity_anger=specificity_anger))

In [None]:
# Confusion counts of the fear classifier on its test split
fear_counts = perf_measure(list(Y_test_fear), list(predictions_fear))
TP_fear, FP_fear, TN_fear, FN_fear = fear_counts

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
sensitivity_fear = TP_fear / (FN_fear + TP_fear)
specificity_fear = TN_fear / (FP_fear + TN_fear)

print("Sensitivity Measure for Fear Classification: {sensitivity_fear}".format(sensitivity_fear=sensitivity_fear))
print("Specificity Measure for Fear Classification: {specificity_fear}".format(specificity_fear=specificity_fear))

In [None]:
# Confusion counts of the both-emotions classifier on its test split
both_counts = perf_measure(list(Y_test_both), list(predictions_both))
TP_both, FP_both, TN_both, FN_both = both_counts

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
sensitivity_both = TP_both / (FN_both + TP_both)
specificity_both = TN_both / (FP_both + TN_both)

print("Sensitivity Measure for Both Classification: {sensitivity_both}".format(sensitivity_both=sensitivity_both))
print("Specificity Measure for Both Classification: {specificity_both}".format(specificity_both=specificity_both))

# Let's try under-sampling 

In [None]:
import seaborn as sns

# Balance the anger classes by randomly under-sampling the negative class down
# to the number of positive samples. The sample size is derived from the data
# (the original hard-coded figures were 781 positives out of 2231 negatives,
# i.e. 1450 negatives removed) so the cell stays correct if the labels change.
pos_anger_df = df_anger.loc[df_anger['label numerical'] == 1]
all_neg_anger_df = df_anger.loc[df_anger['label numerical'] == 0]

# Never request more negatives than exist; sample() would raise otherwise
n_negatives = min(len(pos_anger_df), len(all_neg_anger_df))
neg_anger_df = all_neg_anger_df.sample(n=n_negatives, random_state=42)

normalized_anger_df = pd.concat([pos_anger_df, neg_anger_df])

# plot the class counts of the dataset after the undersampling
plt.figure(figsize=(8, 8))
# pass the column as keyword x: positional data arguments are not accepted
# by newer seaborn releases
sns.countplot(x='label numerical', data=normalized_anger_df)
plt.title('Balanced Classes')
plt.show()

# Repeat the steps from above for anger classification after undersampling

In [None]:
# Split the class-balanced (undersampled) anger data; fixed seed for reproducibility
X_train_anger, X_test_anger, Y_train_anger, Y_test_anger = train_test_split(normalized_anger_df['description'], 
                                                                            normalized_anger_df['label numerical'], 
                                                                            random_state=1)

print('Number of rows in the total set for anger: {}'.format(normalized_anger_df.shape[0]))
print('Number of rows in the training set for anger: {}'.format(X_train_anger.shape[0]))
print('Number of rows in the test set for anger: {}'.format(X_test_anger.shape[0]))

###################################################################################

# instantiate Countvectorizer method
count_vector_anger = CountVectorizer()

# learn the vocabulary on the training data only and return its matrix
training_data_anger = count_vector_anger.fit_transform(X_train_anger)

# encode the testing data with the training vocabulary and return its matrix
testing_data_anger = count_vector_anger.transform(X_test_anger)

###################################################################################

naive_bayes_anger = MultinomialNB()
naive_bayes_anger.fit(training_data_anger, Y_train_anger)

predictions_anger = naive_bayes_anger.predict(testing_data_anger)

###################################################################################

# Class names ordered by numeric label (0 = 'none', 1 = 'anger').
# scikit-learn lists report rows and confusion-matrix rows/columns in sorted
# label order, so the names have to follow that order.
labels = ['none', 'anger']

print('Classification report for undersampled anger classification: ')
print('---------------------------------------------------------- ')
print(classification_report(Y_test_anger, predictions_anger, target_names=labels))
print('---------------------------------------------------------- ')
# Metric functions take (y_true, y_pred); passing them the other way round
# swaps precision and recall.
print('Accuracy score: ', format(accuracy_score(Y_test_anger, predictions_anger)))
print('Precision score: ', format(precision_score(Y_test_anger, predictions_anger)))
print('Recall score: ', format(recall_score(Y_test_anger, predictions_anger)))
print('F1 score: ', format(f1_score(Y_test_anger, predictions_anger)))
print('---------------------------------------------------------- ')

cm = confusion_matrix(list(Y_test_anger), predictions_anger)
print("Confusion matrix undersampled anger: \n")
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion Matrix Undersampled Anger')
fig.colorbar(cax)
# Pin exactly one tick per class so each label sits on its own row/column,
# independent of the tick locations matplotlib would pick by default
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_undersampled_anger.png')
plt.show()

In [None]:
def lr_cv(splits, X, Y, pipeline, average_method):
    '''
    Description:
        Cross-validates a pipeline with a shuffled StratifiedKFold
        (random_state=777). For every fold the pipeline is fitted on the
        training part and evaluated on the test part; the per-class
        precision, recall and F1 of the fold are printed. After the last
        fold the mean and standard deviation of accuracy, precision,
        recall and F1 (as percentages) are printed.
    Input:
        splits: number of folds
        X: samples; indexed directly with the integer index arrays
           produced by the splitter (X[train] / X[test])
        Y: labels; indexed the same way as X. The printed column header
           reads "negative positive", so two classes are presumably
           expected -- TODO confirm
        pipeline: object providing fit, predict and score
        average_method: value passed as average= to the averaged
                        precision / recall / F1 scores
    Output:
        None; all results are printed
    '''
    folds = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy, precision, recall, f1 = [], [], [], []

    for train_idx, test_idx in folds.split(X, Y):
        fitted = pipeline.fit(X[train_idx], Y[train_idx])
        X_fold, Y_fold = X[test_idx], Y[test_idx]
        predicted = fitted.predict(X_fold)

        # score() of the fitted pipeline is recorded as the fold accuracy
        accuracy.append(fitted.score(X_fold, Y_fold) * 100)

        print('              negative     positive')
        per_metric = (
            ('precision:', precision_score, precision),
            ('recall:   ', recall_score, recall),
            ('f1 score: ', f1_score, f1),
        )
        for caption, metric, history in per_metric:
            # averaged value goes into the running history ...
            history.append(metric(Y_fold, predicted, average=average_method) * 100)
            # ... while the per-class values are shown for this fold
            print(caption, metric(Y_fold, predicted, average=None))
        print('-' * 50)

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

In [None]:
import codecs
import unidecode
import re
import spacy
# Load the spaCy pipeline registered under the name 'en'; spacy_cleaner calls
# it to tokenise the text and to obtain lemmas
nlp = spacy.load('en')

def spacy_cleaner(text):
    '''
    Description:
        Normalises one piece of text for vectorisation:
        1. decode escape sequences and transliterate to ASCII (unidecode),
        2. expand contractions using contraction_mapping,
        3. parse with the spaCy pipeline `nlp` and drop punctuation,
           whitespace, number-like and URL-like tokens and tokens starting
           with '@',
        4. keep pronouns (lemma '-PRON-') as written, replace other tokens
           by their lemma stripped of non-letters and keep it only if more
           than one character remains,
        5. collapse every run of a repeated character to two characters.
    Input:
        text: the raw text to clean
    Output:
        The cleaned text as a single space-separated string
    '''
    try:
        # NOTE(review): for a str containing non-ASCII characters,
        # 'unicode_escape' appears to produce mojibake before unidecode
        # runs -- confirm this decode step is wanted for str input
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
    except Exception:
        # Fall back to the undecoded text when the escape decoding fails.
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        decoded = unidecode.unidecode(text)
    apostrophe_handled = re.sub("’", "'", decoded)
    # Contractions are looked up per space-separated word, exact match only
    expanded = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    parsed = nlp(expanded)
    final_tokens = []
    for t in parsed:
        if t.is_punct or t.is_space or t.like_num or t.like_url or str(t).startswith('@'):
            continue
        if t.lemma_ == '-PRON-':
            # pronouns keep their surface form
            final_tokens.append(str(t))
        else:
            sc_removed = re.sub("[^a-zA-Z]", '', str(t.lemma_))
            if len(sc_removed) > 1:
                final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    # e.g. "sooooo" -> "soo": runs of the same character are cut to two
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    return spell_corrected

In [None]:

# Lookup table from an English contraction to its expanded form.
# spacy_cleaner replaces every space-separated word that exactly matches a key
# (keys are case-sensitive, e.g. both "I'm" and "i'm" are listed).
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" } 

In [None]:
# Spot-check the cleaner on the first ten descriptions
[spacy_cleaner(raw_text) for raw_text in df_anger['description'].iloc[:10]]

In [None]:
# Store the cleaned version of every description in a new 'clean_text' column
df_anger['clean_text'] = df_anger['description'].map(spacy_cleaner)

In [None]:
# NOTE(review): X_SMOTE is only assigned in the following cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError; it needs to run
# after the SMOTE cell.
X_SMOTE.shape

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

tvec = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
lr = LogisticRegression()

# Pipeline for the (currently disabled) cross-validation run below
# ROS_pipeline = make_pipeline(tvec, RandomOverSampler(random_state=777),lr)
SMOTE_pipeline = make_pipeline(tvec, SMOTE(random_state=777), lr)

df_labelnumerical_temp = df_anger['label numerical']

#lr_cv(5, df_anger.clean_text, df_labelnumerical_temp, SMOTE_pipeline, 'macro')

##################################################################

# Split the cleaned text BEFORE vectorising and oversampling. Fitting TF-IDF
# or SMOTE on the full dataset leaks test information into training and puts
# synthetic samples into the test set, which inflates the reported scores.
X_train_text, X_test_text, Y_train_original, Y_test_anger = train_test_split(df_anger['clean_text'],
                                                                             df_anger['label numerical'],
                                                                             random_state=1)

# TF-IDF vocabulary and weights are learned from the training text only
tv = TfidfVectorizer(stop_words=None, max_features=100000)
training_tfidf = tv.fit_transform(X_train_text)
X_test_anger = tv.transform(X_test_text)

# Oversample the minority class of the training part only; the test part
# keeps its real samples and its real class distribution.
# fit_resample is the current name of the former fit_sample method.
smt = SMOTE(random_state=777, k_neighbors=1)
X_SMOTE, y_SMOTE = smt.fit_resample(training_tfidf, Y_train_original)
X_train_anger, Y_train_anger = X_SMOTE, y_SMOTE

print('Number of rows in the training set for anger: {}'.format(X_train_anger.shape[0]))
print('Number of rows in the test set for anger: {}'.format(X_test_anger.shape[0]))

###################################################################################

naive_bayes_anger = MultinomialNB()
naive_bayes_anger.fit(X_train_anger, Y_train_anger)

predictions_anger = naive_bayes_anger.predict(X_test_anger)

###################################################################################

# Class names ordered by numeric label (0 = 'none', 1 = 'anger').
# scikit-learn lists report rows and confusion-matrix rows/columns in sorted
# label order, so the names have to follow that order.
labels = ['none', 'anger']

print('Classification report for oversampled anger classification: ')
print('---------------------------------------------------------- ')
print(classification_report(Y_test_anger, predictions_anger, target_names=labels))
print('---------------------------------------------------------- ')
# Metric functions take (y_true, y_pred); passing them the other way round
# swaps precision and recall.
print('Accuracy score: ', format(accuracy_score(Y_test_anger, predictions_anger)))
print('Precision score: ', format(precision_score(Y_test_anger, predictions_anger)))
print('Recall score: ', format(recall_score(Y_test_anger, predictions_anger)))
print('F1 score: ', format(f1_score(Y_test_anger, predictions_anger)))
print('---------------------------------------------------------- ')

cm = confusion_matrix(list(Y_test_anger), predictions_anger)
print("Confusion matrix oversampled anger: \n")
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion Matrix Oversampled Anger')
fig.colorbar(cax)
# Pin exactly one tick per class so each label sits on its own row/column,
# independent of the tick locations matplotlib would pick by default
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_oversampled_anger.png')
plt.show()