In [399]:
import pandas as pd
import os
import preprocessor 
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import fasttext
import csv
import datetime
from bs4 import BeautifulSoup
import re
import itertools
import emoji
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ammarahmad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Data

In [400]:
folder_path = "../Data Twitter/Train/"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_train = pd.concat(dfs)
df_train.columns = ['text','label']
df_train['label_numeric'] = df_train['label'].astype('category').cat.codes


folder_path = "../Data Twitter/Test/"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_train_2 = pd.concat(dfs)
df_train_2.columns = ['text','label']
df_train_2['label_numeric'] = df_train_2['label'].astype('category').cat.codes

df_train = df_train.append(df_train_2).reset_index()
print('total train samples : ',len(df_train))
print(df_train['label'].value_counts())

class_mapping = list(df_train['label'].astype('category').cat.categories)

total train samples :  6755
fear       2142
anger      1617
joy        1537
sadness    1459
Name: label, dtype: int64


In [401]:
folder_path = "../Data Twitter/Dev/"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_dev = pd.concat(dfs)
df_dev.columns = ['text','label']
df_dev['label_numeric'] = df_dev['label'].astype('category').cat.codes

folder_path = "../Data Twitter/Dev/without intensity"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_dev_2 = pd.concat(dfs)
df_dev_2.columns = ['text','label']
df_dev_2['label_numeric'] = df_dev_2['label'].astype('category').cat.codes

df_dev = df_dev.append(df_dev_2).reset_index()
print('total train samples : ',len(df_dev))
print(df_dev['label'].value_counts())

total train samples :  694
fear       220
anger      168
joy        158
sadness    148
Name: label, dtype: int64


# Preprocess Data

In [403]:
preprocessor.set_options(preprocessor.OPT.URL,preprocessor.OPT.RESERVED)
stop_words = nltk.corpus.stopwords.words('english')
def preprocess(text_str):    
    text_str = preprocessor.tokenize(text_str)
    text_str = ' '.join([word for word in text_str.split(' ') if word.lower() not in stop_words])
    return text_str

In [402]:
lemmatizer = WordNetLemmatizer()

def load_dict_smileys():
    
    return {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }


def load_dict_contractions_slangs():
    
    cont = {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "i'd":"i would",
        "i'll":"i will",
        "i'm":"i am",
        "i'm'a":"i am about to",
        "i'm'o":"i am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "i've":"i have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks",
        "rn":"right now",
        "atm": "at the moment",
        "idk": "i dont know",
        "cuz": "because",
        "bcuz": "because",
        "ur": "you are",
        "ly": "love you",
        "lol": "laugh out loud",
        "rofl": "rolling on floor laughing",
        "lmao": "laughing my ass off",
        "ok": "okay",
        "ty": "thank you",
        "fav": "favorite",
        "omg": "oh my god"
        }
    
    to_ret = {}
    for k,v in cont.items():
        key = k.lower().replace("'",'')
        to_ret[key] = v.lower()
        
    return to_ret


def strip_accents(text):
    if 'ø' in text or  'Ø' in text:
        #Do nothing when finding ø 
        return text   
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)


def tweet_cleaning_for_sentiment_analysis(tweet):    
    
    #Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()
    #Special case not handled previously.
    tweet = tweet.replace('\x92',"'")
    #Removal of account
    tweet = ' '.join(re.sub("(@[A-Za-z0-9_]+)", " ", tweet).split())
    # removal of hashtag
    tweet = tweet.replace('#','')
    #Removal of address
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    #Lower case
    tweet = tweet.lower()
    # rempval of 'RT'
    tweet = tweet.replace('rt','')
    # Removal of Punctuation
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    # removal of stop words
    tweet = ' '.join([word for word in tweet.split(' ') if word not in stop_words])
    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS_SLANGS = load_dict_contractions_slangs()
    tweet = tweet.replace("’","'").replace("'","")
    words = tweet.split()
    reformed = [CONTRACTIONS_SLANGS[word] if word in CONTRACTIONS_SLANGS else word for word in words]
    tweet = " ".join(reformed)

    # Standardizing words - lemmatization
    tweet = ' '.join([lemmatizer.lemmatize(word) for word in tweet.split(' ')]).lower()
    
    #Deal with smileys
    #source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()  
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    # replace emojis with desctiption - remove accents - remove underscores
    tweet = emoji.demojize(tweet).lower()
    tweet = strip_accents(tweet).lower()
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split()).replace('_',' ')
    
    # remove new line characters
    tweet = tweet.strip().replace('\\n',' ')
    
    # only keep alphabets
    tweet = ''.join([c for c in tweet if (c.isalpha() or c ==' ')]).strip()
    
    # rempove double spaces and triple spaces
    tweet = tweet.replace('   ',' ').replace('  ',' ')
    
    # one word replacements
    replacement_dict = {
        'u': 'you',
        'v': 'we',
        'r': 'are',
        'w': 'we',
        'n': 'and',
        'nd': 'and',
        '&': 'and'
    }
    
    tweet = ' '.join([replacement_dict[word] if word in replacement_dict else word for word in tweet.split(' ')])
    
    # removal of stop words
    tweet = ' '.join([word for word in tweet.split(' ') if word not in stop_words])

    return tweet
    

In [404]:
df_train['text'] = df_train['text'].apply(tweet_cleaning_for_sentiment_analysis)
df_dev['text'] = df_dev['text'].apply(tweet_cleaning_for_sentiment_analysis)

# Fasttext

In [405]:
def transform_instance(txt,label):
    cur_row = []
    #Prefix the index-ed label with __label__
    label = "__label__" + label  
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(txt))
    return cur_row


def preprocess(input_text,labels, output_file, keep=1):
    i=0
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        for i in range(len(labels)):
            row_output = transform_instance(input_text[i],labels[i])
            csv_writer.writerow(row_output )
            i=i+1
            if i%10000 ==0:
                print(i)
            

            
def prepare_data(lbl):
    
    df_train_0 = df_train[df_train['label'] == lbl]
    df_train_rest = df_train[df_train['label'] != lbl]
    df_train_rest = df_train_rest.sample(int(len(df_train_0)*1.5))
                                         

    df_train_sampled = df_train_0.append(df_train_rest)
    df_train_sampled = df_train_sampled.sample(frac=1).reset_index()
    
    train_labels = df_train_sampled['label'].apply(lambda x: x if x == lbl else 'not-' + lbl ).tolist()
    test_labels = df_dev['label'].apply(lambda x: x if x == lbl else 'not-' + lbl ).tolist()
    
    
    # Preparing the training dataset   
    preprocess(df_train_sampled['text'].tolist(),train_labels, 'tweets' + lbl + '.train')
    # Preparing the validation dataset        
    preprocess(df_dev['text'].tolist(),test_labels, 'tweets' + lbl + '.test')
    
    return test_labels
    
def train(lbl):
    
    Y_test = prepare_data(lbl)
    hyper_params = {"lr": 0.01,
                "epoch": 200,
                "wordNgrams": 1,
                "dim": 100,
                "loss": "softmax",
                "ws": 3}  
    model = fasttext.train_supervised(input='tweets' + lbl + '.train',**hyper_params)
    
    result = model.test('tweets' + lbl + '.train')
    validation = model.test('tweets' + lbl + '.test')
    # DISPLAY ACCURACY OF TRAINED MODEL
    text_line = str(hyper_params) + "\naccuracy:" + str(result[1])  + ",validation:" + str(validation[1]) + '\n' 
    print(text_line)
    
    X_test = df_dev['text'].tolist()
    Y_pred = model.predict(X_test)
    Y_pred = [x[0].split('_')[-1] for x in Y_pred[0]]
    print(classification_report(Y_test,Y_pred))
    return model
    


## Train individual Models

In [406]:
model_joy = train('joy')
model_anger = train('anger')
model_fear = train('fear')
model_sadness = train('sadness')

{'lr': 0.01, 'epoch': 200, 'wordNgrams': 1, 'dim': 100, 'loss': 'softmax', 'ws': 3}
accuracy:0.9981780322748568,validation:0.9481268011527377

              precision    recall  f1-score   support

         joy       0.89      0.89      0.89       158
     not-joy       0.97      0.97      0.97       536

    accuracy                           0.95       694
   macro avg       0.93      0.93      0.93       694
weighted avg       0.95      0.95      0.95       694

{'lr': 0.01, 'epoch': 200, 'wordNgrams': 1, 'dim': 100, 'loss': 'softmax', 'ws': 3}
accuracy:0.9888668975754576,validation:0.9250720461095101

              precision    recall  f1-score   support

       anger       0.84      0.86      0.85       168
   not-anger       0.95      0.95      0.95       526

    accuracy                           0.93       694
   macro avg       0.90      0.90      0.90       694
weighted avg       0.93      0.93      0.93       694

{'lr': 0.01, 'epoch': 200, 'wordNgrams': 1, 'dim': 100, 'los

## Train Multiclass Model

In [409]:
train_labels = df_train['label'].tolist()
test_labels = df_dev['label'].tolist()
# Preparing the training dataset   
preprocess(df_train['text'].tolist(),train_labels, 'tweets.train')
# Preparing the validation dataset        
preprocess(df_dev['text'].tolist(),test_labels, 'tweets.test')

hyper_params = {"lr": 0.01,
                "epoch": 100,
                "wordNgrams": 1,
                "dim": 100,
                "loss": "softmax",
                "ws": 5}  

model_all = fasttext.train_supervised(input='tweets.train',**hyper_params)

result = model_all.test('tweets.train')
validation = model_all.test('tweets.test')
# DISPLAY ACCURACY OF TRAINED MODEL
text_line = str(hyper_params) + "\naccuracy:" + str(result[1])  + ",validation:" + str(validation[1]) + '\n' 
print(text_line)

X_test = df_dev['text'].tolist()
Y_pred = model_all.predict(X_test)
Y_pred = [x[0].split('_')[-1] for x in Y_pred[0]]
print(classification_report(test_labels,Y_pred))

{'lr': 0.01, 'epoch': 100, 'wordNgrams': 1, 'dim': 100, 'loss': 'softmax', 'ws': 5}
accuracy:0.9700962250185048,validation:0.8472622478386167

              precision    recall  f1-score   support

       anger       0.81      0.86      0.83       168
        fear       0.82      0.88      0.85       220
         joy       0.91      0.87      0.89       158
     sadness       0.88      0.76      0.81       148

    accuracy                           0.85       694
   macro avg       0.85      0.84      0.85       694
weighted avg       0.85      0.85      0.85       694



In [410]:
num_wrong = 0
for i in range(len(X_test)):
    if (Y_pred[i]!=test_labels[i]):
        print(X_test[i])
        print('pred : ',Y_pred[i])
        print('actual : ',test_labels[i])
        print('\n\n')
        num_wrong+=1
print(num_wrong)

sometimes get mad something minuscule try ruin somebody life like lose job like get federal prison
pred :  fear
actual :  anger



please canadian player play player lag atrocious fixthisgame trash sfvrefund
pred :  fear
actual :  anger



sorry guy absolutely idea time cam tomorrow keep posted
pred :  fear
actual :  anger



ding wearing look man found arch enemy bed missus angryman
pred :  sadness
actual :  anger



ding wearing look man found arch enemy bed missus angryman scowl
pred :  sadness
actual :  anger



oh brian invite
pred :  fear
actual :  anger



take k number madden low dropped people unhappy
pred :  sadness
actual :  anger



opinion worst delhi govt acrid hypocrisy
pred :  fear
actual :  anger



thirsty chance disagree left even realize something affront bigoted platform
pred :  fear
actual :  anger



baby born soon lifechanging year feel like yesterday sad happy emotionalrollercoaster
pred :  sadness
actual :  anger



coincidentally watched ulzanas raid last nig

In [397]:
model_all.save_model('fasttext_all.bin')

## One vs All

In [364]:
def predict_all(model,X):
    Y_pred = model.predict(df_dev['text'].tolist())
    Y_preds = [x[0].split('_')[-1] for x in Y_pred[0]]
    Y_pred_scores = [x[0] for x in Y_pred[1]]
    return Y_preds, Y_pred_scores

def predict(model_sadness,model_joy,model_anger,model_fear,model_all,X):
    # predict sadness
    sadness_preds , sadness_scores = predict_all(model_sadness,X)
    
    # predict joy
    joy_preds , joy_scores = predict_all(model_joy,X)
    
    # predict anger
    anger_preds , anger_scores = predict_all(model_anger,X)
    
    # predict fear
    fear_preds , fear_scores = predict_all(model_fear,X)
    
    # predict all
    all_preds , all_scores = predict_all(model_all,X)
    
    final_preds = []
    num_not = 0
    for i in range(len(X)):
        predictions = []
        predictions_scores = []
        # check sadness
        if not ('not' in sadness_preds[i]):
            predictions.append(sadness_preds[i])
            predictions_scores.append(sadness_scores[i])
            
        # check joy
        if not ('not' in joy_preds[i]):
            predictions.append(joy_preds[i])
            predictions_scores.append(joy_scores[i])
            
        # check anger
        if not ('not' in anger_preds[i]):
            predictions.append(anger_preds[i])
            predictions_scores.append(anger_scores[i])
            
        # check fear
        if not ('not' in fear_preds[i]):
            predictions.append(fear_preds[i])
            predictions_scores.append(fear_scores[i])
            
        if len(predictions) == 0:
            num_not+=1
            final_preds.append(all_preds[i])
#             prediction_scores = [sadness_scores[i],joy_scores[i],anger_scores[i],fear_scores[i]]
#             predictions = ['sadness','joy','anger','fear']
#             scores = [1 - t for t in prediction_scores]
#             max_index = scores.index(max(scores))
#             final_preds.append(predictions[max_index])
        elif len(predictions) == 1:
            final_preds.append(predictions[0])
        else:
            # choose the one with the biggest score
            max_index = predictions_scores.index(max(predictions_scores))
            final_preds.append(predictions[max_index])
    print(num_not)
    return final_preds

In [365]:
X_test = df_dev['text'].tolist()
Y_pred = predict(model_sadness,model_joy,model_anger,model_fear,model_all,X_test)
print(classification_report(test_labels,Y_pred))

60
              precision    recall  f1-score   support

       anger       0.84      0.86      0.85       168
        fear       0.85      0.85      0.85       220
         joy       0.85      0.87      0.86       158
     sadness       0.84      0.80      0.82       148

    accuracy                           0.85       694
   macro avg       0.85      0.85      0.85       694
weighted avg       0.85      0.85      0.85       694



## One vs All Experiment with Sadness + joy & anger + fear groups

In [443]:
def get_scores(model,X):
    
    preds = model.predict(X,k=2)
    scores_labels = preds[0]
    scores = preds[1]
    
    out = []
    for i in range(len(X)):
        index = 0
        if 'not' in scores_labels[i][0]:
            index = 1
        out.append(scores[i][index])
            
    return np.array(out)

def predict_sum(model_sadness,model_joy,model_anger,model_fear,X):
    
    # predict sadness
    sadness_scores = get_scores(model_sadness,X)
    
    # predict joy
    joy_scores = get_scores(model_joy,X)

    # predict anger
    anger_scores = get_scores(model_anger,X)

    # predict fear
    fear_scores = get_scores(model_fear,X)
    
    # add groups
    sad_joy_scores = sadness_scores + joy_scores
    anger_fear_scores = anger_scores + fear_scores
    
    final_preds = []
    for i in range(len(X)):
        if sad_joy_scores[i] >= anger_fear_scores[i]:
            if sadness_scores[i] >= joy_scores[i]:
                final_preds.append('sadness')
            else:
                final_preds.append('joy')
        else:
            if anger_scores[i] >= fear_scores[i]:
                final_preds.append('anger')
            else:
                final_preds.append('fear')
                
    return final_preds

In [444]:
X_test = df_dev['text'].tolist()
Y_pred = predict_sum(model_sadness,model_joy,model_anger,model_fear,X_test)
print(classification_report(test_labels,Y_pred))

              precision    recall  f1-score   support

       anger       0.82      0.88      0.85       168
        fear       0.87      0.84      0.85       220
         joy       0.87      0.87      0.87       158
     sadness       0.81      0.78      0.79       148

    accuracy                           0.84       694
   macro avg       0.84      0.84      0.84       694
weighted avg       0.84      0.84      0.84       694



# Logistic Regression

### Multiclass

In [388]:
num_features = 3000
model_tfidf = TfidfVectorizer(max_features=num_features)
model_tfidf.fit(df_train['text'])

X_train = model_tfidf.transform(df_train['text']).toarray()
X_test = model_tfidf.transform(df_dev['text']).toarray()

Y_train = df_train['label_numeric']
Y_test = df_dev['label_numeric']

model_t = LogisticRegression(max_iter=200)
model_t.fit(X_train,Y_train)
Y_pred = model_t.predict(X_test)
print(classification_report(Y_test,Y_pred,target_names = class_mapping))

              precision    recall  f1-score   support

       anger       0.88      0.85      0.86       168
        fear       0.80      0.94      0.87       220
         joy       0.96      0.86      0.91       158
     sadness       0.88      0.80      0.84       148

    accuracy                           0.87       694
   macro avg       0.88      0.86      0.87       694
weighted avg       0.87      0.87      0.87       694



### Individual Models

In [387]:
for lbl in [0,1,2,3]:
    
    df_train_0 = df_train[df_train['label_numeric'] == lbl]
    df_train_rest = df_train[df_train['label_numeric'] != lbl]
    df_train_rest = df_train_rest.sample(int(len(df_train_0)*))
                                         

    df_train_sampled = df_train_0.append(df_train_rest)
    df_train_sampled = df_train_sampled.sample(frac=1).reset_index()

    Y_train = df_train_sampled['label_numeric'].apply(lambda x: 0 if x == lbl else 1 ).tolist()
    Y_test = df_dev['label_numeric'].apply(lambda x: 0 if x == lbl else 1 ).tolist()

    num_features = 3000
    model_tfidf = TfidfVectorizer(max_features=num_features)
    model_tfidf.fit(df_train['text'])

    X_train = model_tfidf.transform(df_train_sampled['text']).toarray()
    X_test = model_tfidf.transform(df_dev['text']).toarray()
    

    model_t = LogisticRegression()
    model_t.fit(X_train,Y_train)
    Y_pred = model_t.predict(X_test)
    print(classification_report(Y_test,Y_pred,target_names = [class_mapping[lbl],'not-'+class_mapping[lbl]]))

              precision    recall  f1-score   support

       anger       0.92      0.82      0.87       168
   not-anger       0.94      0.98      0.96       526

    accuracy                           0.94       694
   macro avg       0.93      0.90      0.91       694
weighted avg       0.94      0.94      0.94       694

              precision    recall  f1-score   support

        fear       0.93      0.67      0.78       220
    not-fear       0.87      0.97      0.92       474

    accuracy                           0.88       694
   macro avg       0.90      0.82      0.85       694
weighted avg       0.88      0.88      0.87       694

              precision    recall  f1-score   support

         joy       0.92      0.75      0.83       158
     not-joy       0.93      0.98      0.95       536

    accuracy                           0.93       694
   macro avg       0.93      0.86      0.89       694
weighted avg       0.93      0.93      0.93       694

              preci

### One vs All

In [373]:
num_features = 3000
model_tfidf = TfidfVectorizer(max_features=num_features,ngram_range=(1,1))
model_tfidf.fit(df_train['text'])

X_train = model_tfidf.transform(df_train['text']).toarray()
X_test = model_tfidf.transform(df_dev['text']).toarray()

Y_train = df_train['label_numeric'].tolist()
Y_test = df_dev['label_numeric'].tolist()

LR = OneVsRestClassifier(LogisticRegression(max_iter=200))
LR.fit(X_train, Y_train)
Y_pred = LR.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       168
           1       0.79      0.94      0.86       220
           2       0.96      0.86      0.91       158
           3       0.88      0.76      0.81       148

    accuracy                           0.86       694
   macro avg       0.87      0.85      0.86       694
weighted avg       0.87      0.86      0.86       694

