In [1]:
import pandas as pd
import os
import preprocessor 
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ammarahmad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Data

In [2]:
# Read every .txt file in the training folder as a headerless tab-separated
# table and stack them into one frame with 'text' and 'label' columns.
folder_path = "../Data Twitter/Train/"
dfs = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and everything derived from it) identical on every run.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        # Only columns 1 and 2 survive; they are named 'text' and 'label' below.
        # NOTE(review): columns 0 and 3 look like an id and a score - confirm against the files.
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

if not dfs:
    # Fail with a message that names the folder instead of pandas' generic concat error.
    raise ValueError(f"no .txt files found in {folder_path!r}")

# ignore_index gives one unique 0..n-1 index instead of repeating each file's row numbers.
df_train = pd.concat(dfs, ignore_index=True)
df_train.columns = ['text','label']
# Build the categorical once so the integer codes and class_mapping come from
# the same object: class_mapping[i] is the label whose code is i.
train_labels = df_train['label'].astype('category')
df_train['label_numeric'] = train_labels.cat.codes
print('total train samples : ',len(df_train))
print(df_train['label'].value_counts())

class_mapping = list(train_labels.cat.categories)

total train samples :  3613
fear       1147
anger       857
joy         823
sadness     786
Name: label, dtype: int64


In [3]:
# Read every .txt file in the dev folder the same way the training files are
# read, producing a frame with 'text' and 'label' columns.
folder_path = "../Data Twitter/Dev/"
dfs = []
# Sorted listing keeps the dev row order reproducible across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.txt'):
        path = os.path.join(folder_path,filename)
        df = pd.read_csv(path,sep='\t',header=None)
        # Only columns 1 and 2 survive; they are named 'text' and 'label' below.
        df = df.drop(columns=[df.columns[0],df.columns[3]])
        dfs.append(df)

if not dfs:
    # Fail with a message that names the folder instead of pandas' generic concat error.
    raise ValueError(f"no .txt files found in {folder_path!r}")

df_dev = pd.concat(dfs, ignore_index=True)
df_dev.columns = ['text','label']
# Encode dev labels with the TRAINING categories (class_mapping) so a code means
# the same label in both splits even if the dev set lacks a class or orders
# its labels differently. Labels unseen in training get code -1.
df_dev['label_numeric'] = pd.Categorical(df_dev['label'], categories=class_mapping).codes
unknown_labels = sorted(set(df_dev.loc[df_dev['label_numeric'] < 0, 'label']))
if unknown_labels:
    raise ValueError(f"dev labels not present in the training data: {unknown_labels}")
print('total dev samples : ',len(df_dev))
print(df_dev['label'].value_counts())

total train samples :  347
fear       110
anger       84
joy         79
sadness     74
Name: label, dtype: int64


# Preprocess Data

In [4]:
# Restrict the `preprocessor` module to its URL and RESERVED options; the
# tokenize() call inside preprocess() runs with these settings.
preprocessor.set_options(preprocessor.OPT.URL,preprocessor.OPT.RESERVED)
# The stopwords corpus is a separate NLTK resource from 'punkt'; fetch it here
# so the lookup below does not raise LookupError on a fresh environment.
nltk.download('stopwords', quiet=True)
stop_words = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests; `stop_words` stays a list.
_stop_word_set = frozenset(stop_words)

def preprocess(text_str):
    """Tokenize a tweet with `preprocessor` and drop English stop words.

    Parameters
    ----------
    text_str : str
        Raw tweet text.

    Returns
    -------
    str
        The tokenized text with every space-separated word whose lowercase
        form is an NLTK English stop word removed, re-joined with single spaces.
    """
    text_str = preprocessor.tokenize(text_str)
    # Split on a literal space (not arbitrary whitespace) to keep the original tokenization.
    kept_words = [word for word in text_str.split(' ') if word.lower() not in _stop_word_set]
    return ' '.join(kept_words)

In [5]:
# Clean the 'text' column of both splits with the same preprocess() function.
# The column is overwritten in place because the vectorizer cell reads 'text'.
for frame in (df_train, df_dev):
    frame['text'] = frame['text'].apply(preprocess)

# Prepare TF-IDF Features

In [9]:
# num_features is handed to TfidfVectorizer as max_features.
num_features = 1000
# The vectorizer is fitted on the training text only; the dev text is
# transformed with that same fitted vectorizer.
model_tfidf = TfidfVectorizer(max_features=num_features).fit(df_train['text'])

# Dense feature matrices for both splits (X_test / Y_test hold the dev split).
X_train, X_test = (
    model_tfidf.transform(frame['text']).toarray()
    for frame in (df_train, df_dev)
)
Y_train, Y_test = df_train['label_numeric'], df_dev['label_numeric']

# Train Model

In [7]:
# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF training matrix and the integer-encoded training labels.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

LogisticRegression()

# Predictions

In [8]:
# Predict label codes for the dev features and print per-class metrics.
Y_pred = model.predict(X_test)
# Pass the full set of label codes explicitly: target_names alone is matched
# against whatever codes happen to appear in Y_test/Y_pred, so a class missing
# from the dev split would raise a length-mismatch error or mislabel rows.
print(classification_report(
    Y_test,
    Y_pred,
    labels=list(range(len(class_mapping))),
    target_names=class_mapping,
))

              precision    recall  f1-score   support

       anger       0.82      0.80      0.81        84
        fear       0.75      0.86      0.80       110
         joy       0.92      0.82      0.87        79
     sadness       0.85      0.77      0.81        74

    accuracy                           0.82       347
   macro avg       0.83      0.81      0.82       347
weighted avg       0.82      0.82      0.82       347

