In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Loading the Dataset**

In [None]:
# First look at the raw file, read with pandas' default header handling.
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
)

**Data Preprocessing**

In [None]:
# (rows, columns) of the frame as first loaded.
df.shape

In [None]:
# Peek at the first rows; the column labels shown are whatever pandas took
# from the first line of the file.
df.head()

The CSV file has no header row, so pandas used the first record as the column names (see the output above). I therefore reload the file and assign explicit names to the columns.

In [None]:
# Labels for the six columns, applied positionally when the file is re-read.
col_names = ['Target', 'ID', 'Date', 'Flag', 'User', 'Text']
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    names=col_names,
    encoding='ISO-8859-1',
)
df.shape

In [None]:
# Confirm that the assigned column names now label the frame.
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Number of tweets per distinct Target value.
df['Target'].value_counts()

The positive class is relabelled from 4 to 1 so that the target is binary:

0 → Negative Tweet, 1 → Positive Tweet.

In [None]:
# Relabel the positive class 4 -> 1. Only the Target column is touched, and the
# result is assigned back instead of mutating the whole frame with inplace=True.
df['Target'] = df['Target'].replace({4: 1})
df['Target'].value_counts()

In [None]:
# Bar chart of how many tweets fall in each class. Drawn on an explicit Axes
# so the figure can carry a title and axis labels and stand on its own.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Target', ax=ax)
ax.set(
    title='Class distribution of tweets',
    xlabel='Target (0 = negative, 1 = positive)',
    ylabel='Number of tweets',
);

**Stemming**

Stemming reduces each word to its root form. Before stemming, every tweet is stripped of non-alphabetic characters, lower-cased, and cleared of English stop words.

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused by stemming() for every word.
ps=PorterStemmer()

In [None]:
import nltk
# Download the NLTK 'stopwords' resource used by stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Show the English stop-word list that stemming() filters out.
print(stopwords.words('english'))

In [None]:
# Built once, outside the function: looking the stop-word list up via
# stopwords.words('english') for every single token (and scanning it as a list)
# dominates the runtime when this is applied to the full dataset.
_NON_LETTERS = re.compile('[^a-zA-Z]')
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. stem each remaining word with the shared Porter stemmer ``ps``.

    Parameters
    ----------
    content : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in _STOP_WORDS)

In [None]:
# Run stemming() on every tweet's Text and store the result in a new column.
df['stem_content']=df['Text'].apply(stemming)

**Separating Feature and Target Variables**

In [None]:
# Features: the stemmed tweet text.
X=df['stem_content'].values
# Labels: the sentiment target.
Y=df['Target'].values

**Splitting the Data into Training and Testing Sets**

To ensure that the distribution of the target variable Y is consistent in both the training and testing sets, I used stratify=Y during the data split.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

**Transforming Text Data with TF-IDF Vectorization**

TF-IDF vectorization converts the text data into numerical features. The vectorizer is fitted on the training set only and then applied to the test set.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
v=TfidfVectorizer()
# Fit on the training text only, then transform it into a feature matrix.
X_train=v.fit_transform(x_train)
# Reuse the already-fitted vectorizer on the test text (no refitting).
X_test=v.transform(x_test)

**Model Training and Evaluation**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with the iteration cap set to 1000.
model=LogisticRegression(max_iter=1000)

In [None]:
# Train the classifier on the TF-IDF features of the training tweets.
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predicted labels for the held-out test tweets.
y_pred=model.predict(X_test)

In [None]:
# Accuracy of the predictions against the true test labels.
print('Accuracy score:',accuracy_score(y_test,y_pred))