In [None]:
import pandas as pd
import numpy as np
import os
import preprocessor 
import re
import string
import spacy
from spacy.lang.en import stop_words as spacy_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import gensim.downloader as api

# Load Data

In [None]:
folder_path = "../Data Twitter/Train/"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_train = pd.concat(dfs)
df_train.columns = ['text','label']
df_train['label_numeric'] = df_train['label'].astype('category').cat.codes
print('total train samples : ',len(df_train))
print(df_train['label'].value_counts())

class_mapping = list(df_train['label'].astype('category').cat.categories)

In [None]:
folder_path = "../Data Twitter/Dev/"
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

df_dev = pd.concat(dfs)
df_dev.columns = ['text','label']
df_dev['label_numeric'] = df_dev['label'].astype('category').cat.codes
print('total train samples : ',len(df_dev))
print(df_dev['label'].value_counts())

# Preprocess Data

In [None]:
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.MENTION)  # removes mentions and URLs only
stop_words = spacy_stopwords.STOP_WORDS
punctuations = string.punctuation

def preprocess(text):
    text = preprocessor.clean(text)
    text = re.sub(r'\W+', ' ', text)  # remove non-alphanumeric characters
    # replace numbers with the word 'number'
    text = re.sub(r"\d+", "number", text)
    text = text.lower()  # lower case everything
    
    return text.strip() # remove redundant spaces

In [None]:
df_train['text'] = df_train['text'].apply(preprocess)
df_dev['text'] = df_dev['text'].apply(preprocess)

# Prepare Embeddings

In [None]:
model_gensim = api.load('glove-twitter-100')

In [None]:
num_features = 100
X_train = np.zeros((len(df_train),num_features))
for i,text in enumerate(df_train['text']):
    count = 0
    for word in text.split(' '):
        try:
            X_train[i] += model_gensim[word]
            count+=1
        except:
            continue
    X_train[i] /= count
    
X_test = np.zeros((len(df_dev),num_features))
for i,text in enumerate(df_dev['text']):
    count = 0
    for word in text.split(' '):
        try:
            X_test[i] += model_gensim[word]
            count+=1
        except:
            continue
    X_test[i] /= count
    
Y_train = df_train['label_numeric']
Y_test = df_dev['label_numeric']

# Train Model

In [None]:
from keras.models import Seq

In [21]:
model = LogisticRegression(max_iter=200)
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200)

# Predictions

In [22]:
Y_pred = model.predict(X_test)
print(classification_report(Y_test,Y_pred,target_names=class_mapping))

              precision    recall  f1-score   support

       anger       0.54      0.54      0.54        84
        fear       0.59      0.62      0.60       110
         joy       0.54      0.61      0.57        79
     sadness       0.51      0.41      0.45        74

    accuracy                           0.55       347
   macro avg       0.54      0.54      0.54       347
weighted avg       0.55      0.55      0.55       347

