In [1]:
#This block installs and imports the required libraries. It uses pandas to load and handle data,TfidfVectorizer to turn text into numbers and scikit learn to train model.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [19]:
DF=pd.read_csv(r"Downloads//archive (1)//training.1600000.processed.noemoticon.csv",encoding='latin1',header=None)

In [20]:
DF.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [22]:
DF.columns=['sentiment','id','date','query','user','text']

In [23]:
DF.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [24]:
DF['sentiment'] = DF['sentiment'].replace({0: 'negative', 4: 'positive'})

In [25]:
DF = DF[['text', 'sentiment']]

In [26]:
DF['text'] = DF['text'].fillna('')
DF = DF.dropna(subset=['sentiment'])

In [27]:
DF.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,is upset that he can't update his Facebook by ...,negative
2,@Kenichan I dived many times for the ball. Man...,negative
3,my whole body feels itchy and like its on fire,negative
4,"@nationwideclass no, it's not behaving at all....",negative


In [28]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   text       1600000 non-null  object
 1   sentiment  1600000 non-null  object
dtypes: object(2)
memory usage: 24.4+ MB


In [29]:
DF.isna().sum()

text         0
sentiment    0
dtype: int64

In [30]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(DF['text'])
y = DF['sentiment']

In [31]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [32]:
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb_pred = bnb.predict(X_test)
print("BernoulliNB Accuracy:", accuracy_score(y_test, bnb_pred))
print(classification_report(y_test, bnb_pred))

BernoulliNB Accuracy: 0.758065625
              precision    recall  f1-score   support

    negative       0.76      0.75      0.75    159494
    positive       0.75      0.77      0.76    160506

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



In [33]:
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

Logistic Regression Accuracy: 0.767771875
              precision    recall  f1-score   support

    negative       0.78      0.74      0.76    159494
    positive       0.76      0.79      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [34]:
svm = LinearSVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print("Linear SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

Linear SVM Accuracy: 0.767071875
              precision    recall  f1-score   support

    negative       0.78      0.74      0.76    159494
    positive       0.75      0.80      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [35]:
sample = ["I love this!", "This is terrible."]
sample_vec = tfidf.transform(sample)
svm.predict(sample_vec)

array(['positive', 'negative'], dtype=object)

In [39]:
sample_tweets = ["I love this!", "I hate that!", "It was okay, not great."]
sample_vec = tfidf.transform(sample_tweets)
print("\nSample Predictions:")
print("BernoulliNB:", bnb.predict(sample_vec))
print("SVM:", svm.predict(sample_vec))
print("logistic:", lr.predict(sample_vec))


Sample Predictions:
BernoulliNB: ['positive' 'negative' 'positive']
SVM: ['positive' 'negative' 'positive']
logistic: ['positive' 'negative' 'positive']
