# Importing necessary libraries

In [1]:
import pandas as pd 
import numpy as np
import spacy

# Loading data

In [38]:
# Read only the first 5000 rows. The file has no header row (header=None),
# so the columns come back integer-labelled and are named in a later cell.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere until it is pointed at a local copy.
df = pd.read_csv(
    "C:\\Users\\akhil\\Downloads\\twitter_training.csv\\twitter_training.csv",
    header=None,
    nrows=5000,
)

In [39]:
# Give the four columns readable names (the CSV was read with header=None).
df.columns = ['ID', 'Category', 'Sentiment', 'text']

In [40]:
# Preview the first rows to check that the column names line up with the data.
df.head()

Unnamed: 0,ID,Category,Sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [41]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         5000 non-null   int64 
 1   Category   5000 non-null   object
 2   Sentiment  5000 non-null   object
 3   text       4952 non-null   object
dtypes: int64(1), object(3)
memory usage: 156.4+ KB


In [42]:
# Missing values per column, before any cleaning.
df.isna().sum()

ID            0
Category      0
Sentiment     0
text         48
dtype: int64

In [43]:
# Remove rows that are exact duplicates across all columns. This mutates `df`
# in place and does not reset the index, so row labels keep gaps afterwards.
df.drop_duplicates(inplace=True)

In [44]:
# Re-check missing values per column after de-duplication.
df.isna().sum()

ID            0
Category      0
Sentiment     0
text         27
dtype: int64

In [45]:
# (rows, columns) after de-duplication.
df.shape

(4800, 4)

In [46]:
# Confirm the column names.
df.columns

Index(['ID', 'Category', 'Sentiment', 'text'], dtype='object')

In [47]:
# Drop every row that has a missing value in any column. This mutates `df`
# in place.
df.dropna(inplace=True)

In [48]:
# (rows, columns) after dropping rows with missing values.
df.shape

(4773, 4)

In [49]:
# Verify that no missing values remain.
df.isna().sum()

ID           0
Category     0
Sentiment    0
text         0
dtype: int64

In [50]:
# spaCy is used for speed and efficiency in NLP tasks.
# This shell command downloads the small English model each time the cell runs.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter - confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     --------------- ------------------------ 5.0/12.8 MB 11.6 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 11.7 MB/s eta 0:00:01
     ------------------------------ -------- 10.0/12.8 MB 11.9 MB/s eta 0:00:01
     ----------------------------------- --- 11.5/12.8 MB 10.9 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 10.4 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [51]:
# Load the small English spaCy pipeline; preprocess() reads it as the global `nlp`.
nlp = spacy.load("en_core_web_sm")

# Preprocessing

In [52]:
def preprocess(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token flagged as a stop word or as punctuation is discarded; the lemma
    of each remaining token is kept, in document order.

    Parameters
    ----------
    text : str
        Raw text handed to ``nlp``.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [53]:
# Run preprocess() on every row of `text` (one spaCy call per row) and store
# the cleaned, lemmatized string in a new column.
df['Preprocessed Text'] = df['text'].apply(preprocess) 

In [55]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Label-encode the target (Sentiment), not the free-text column. LabelEncoder
# is meant for class labels; fitting it on `text` would overwrite every raw
# text with an arbitrary integer id. The codes go into a new column so the
# original string labels in df['Sentiment'] stay available to later cells.
le_model = LabelEncoder()
df['Sentiment Encoded'] = le_model.fit_transform(df['Sentiment'])

In [60]:
# Hold out 20% of the rows for testing. The split is stratified on Sentiment
# and seeded (random_state=42) so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed Text'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['Sentiment'],
)

In [63]:
# Sanity-check the sizes of the train and test feature sets.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (3818,)
Shape of X_test:  (955,)


In [64]:
# The import statement had been pasted twice onto one line, which is a
# SyntaxError; a single import is all that is needed.
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
# TF-IDF features, capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit the vocabulary and IDF weights on the training text only ...
X_train_vectorized = vectorizer.fit_transform(X_train)
# ... and reuse that fitted vectorizer, without refitting, on the test text.
X_test_vectorized = vectorizer.transform(X_test)

In [66]:
from sklearn.naive_bayes import MultinomialNB
# classification_report is imported here but not used in this cell.
from sklearn.metrics import classification_report, accuracy_score
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)
# Predict on the held-out test set and report accuracy to two decimals.
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.81


In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [68]:
# Seed the forest so its bootstrap samples and feature sub-sampling, and
# therefore the reported accuracy, are reproducible across Restart & Run All.
# 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vectorized, y_train)

In [69]:
# Random-forest predictions on the held-out test features.
y_pred_rf = rf.predict(X_test_vectorized)

In [70]:
# Test-set accuracy of the random forest (printed unrounded).
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Accuracy: 0.9141361256544502
