In [8]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [9]:
# Loading the training and testing data
train_data = pd.read_csv('twitter_training.csv')
test_data = pd.read_csv('twitter_test.csv')



In [10]:
# Renaming columns for easier access
train_data.columns = ['id ' , 'topic' ,'label' ,'text']
test_data.columns = ['id ' , 'topic' ,'label' ,'text']

train_data.head()


Unnamed: 0,id,topic,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [16]:
import re
import string

# 1. Remove duplicates & NaN
train_data.drop_duplicates(inplace=True)
train_data.dropna(subset=['text', 'label'], inplace=True)

test_data.drop_duplicates(inplace=True)
test_data.dropna(subset=['text', 'label'], inplace=True)

# 2. Clean text (vectorized)
def fast_clean(df):
    df['clean_text'] = (
        df['text']
        .astype(str)
        .str.lower()
        .str.replace(r"http\S+|www\S+", "", regex=True)
        .str.replace(r"@\w+|#", "", regex=True)
        .str.replace(f"[{re.escape(string.punctuation)}]", "", regex=True)
        .str.replace(r"\d+", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return df

train_data = fast_clean(train_data)
test_data = fast_clean(test_data)

print("✅ Cleaning done! Sample:")
print(train_data[['text', 'clean_text']].head())


✅ Cleaning done! Sample:
                                                text  \
0  I am coming to the borders and I will kill you...   
1  im getting on borderlands and i will kill you ...   
2  im coming on borderlands and i will murder you...   
3  im getting on borderlands 2 and i will murder ...   
4  im getting into borderlands and i can murder y...   

                                          clean_text  
0  i am coming to the borders and i will kill you...  
1  im getting on borderlands and i will kill you all  
2  im coming on borderlands and i will murder you...  
3  im getting on borderlands and i will murder yo...  
4  im getting into borderlands and i can murder y...  


In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_data['label_encoded'] = label_encoder.fit_transform(train_data['label'])
test_data['label_encoded'] = label_encoder.transform(test_data['label'])

print("Label classes:", label_encoder.classes_)


Label classes: ['Irrelevant' 'Negative' 'Neutral' 'Positive']


In [18]:
print("Train size:", len(train_data))
print("Unique words (rough estimate):", 
      train_data['clean_text'].str.split().explode().nunique())


Train size: 71655
Unique words (rough estimate): 39035


In [19]:
import os, psutil
print("Memory available (GB):", psutil.virtual_memory().available / 1e9)
print("Train size:", len(train_data))
print("Clean text type:", train_data['clean_text'].dtype)


Memory available (GB): 10.753277952
Train size: 71655
Clean text type: object


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(train_data['label'])
y_test = le.transform(test_data['label'])

# Vectorize text (bag-of-words)
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X_train = vectorizer.fit_transform(train_data['clean_text'])
X_test = vectorizer.transform(test_data['clean_text'])

print("✅ Vectorization done!")
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


✅ Vectorization done!
Train shape: (71655, 10000) Test shape: (999, 10000)


In [21]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
print("Classes:", le.classes_)


Classes: [0 1 2 3]


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr = LogisticRegression(max_iter=500, n_jobs=-1)
lr.fit(X_train, y_train_enc)

y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test_enc, y_pred_lr))
print("\nClassification Report:\n", 
      classification_report(y_test_enc, y_pred_lr, target_names=[str(c) for c in le.classes_]))


Logistic Regression Accuracy: 0.8968968968968969

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86       171
           1       0.85      0.94      0.89       266
           2       0.96      0.86      0.91       285
           3       0.91      0.92      0.91       277

    accuracy                           0.90       999
   macro avg       0.90      0.89      0.89       999
weighted avg       0.90      0.90      0.90       999



In [24]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=10, verbose=True)
mlp.fit(X_train, y_train_enc)

y_pred_mlp = mlp.predict(X_test)
print("MLP Accuracy:", accuracy_score(y_test_enc, y_pred_mlp))
print("\nClassification Report:\n", 
      classification_report(y_test_enc, y_pred_mlp, target_names=[str(c) for c in le.classes_]))



Iteration 1, loss = 0.82752860
Iteration 2, loss = 0.39271941
Iteration 3, loss = 0.23762496
Iteration 4, loss = 0.16770380
Iteration 5, loss = 0.13422584
Iteration 6, loss = 0.11730184
Iteration 7, loss = 0.10773594
Iteration 8, loss = 0.10360868
Iteration 9, loss = 0.09961193
Iteration 10, loss = 0.09679363
MLP Accuracy: 0.958958958958959

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95       171
           1       0.97      0.97      0.97       266
           2       0.94      0.96      0.95       285
           3       0.97      0.95      0.96       277

    accuracy                           0.96       999
   macro avg       0.96      0.96      0.96       999
weighted avg       0.96      0.96      0.96       999





In [15]:
import joblib

# Save the model and vectorizer
joblib.dump(mlp, "sentiment_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")

print("✅ Model, Vectorizer, and Encoder saved!")


✅ Model, Vectorizer, and Encoder saved!
