In [40]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [41]:
df=pd.read_csv('Twitter_Data.csv')
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [42]:
def preprocess_text(clean_text):
    # Check if the value is NaN
    if pd.isna(clean_text):
        return ""

    # Convert text to lowercase
    clean_text = clean_text.lower()
    
    # Remove emojis
    clean_text = clean_text.encode('ascii', 'ignore').decode('ascii')
    
    # Remove special characters
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', clean_text)
    
    return clean_text


In [43]:
clean_text = df["clean_text"].apply(preprocess_text)
df["clean_text"] =clean_text

In [44]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [45]:
#remove stopwords or short words
import nltk
from nltk.corpus import stopwords
def remove_stopwords(input_text):
    words = nltk.word_tokenize(input_text)
    stop_words = set(stopwords.words('english'))
    filtered_words=[word for word in words if word.lower() not in stop_words]
    filtered_text=' '.join(filtered_words)
    return filtered_text

In [46]:
df['cleaned_tweet'] = df['clean_text'].astype(str).apply(remove_stopwords)
df.head()

Unnamed: 0,clean_text,category,cleaned_tweet
0,when modi promised minimum government maximum ...,-1.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [51]:
#tokenization
def tokenize_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    return words

In [52]:
# Convert all values in 'clean_text' column to strings
df['cleaned_tweet'] = df['cleaned_tweet'].astype(str)

# Tokenize the 'clean_text' column
df['tokenized_tweet'] = df['cleaned_tweet'].apply(tokenize_sentence)
df.head()

Unnamed: 0,clean_text,category,cleaned_tweet,tokenized_tweet
0,when modi promised minimum government maximum ...,-1.0,modi promised minimum government maximum gover...,"[modi, promised, minimum, government, maximum,..."
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi,"[talk, nonsense, continue, drama, vote, modi]"
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...,"[say, vote, modi, welcome, bjp, told, rahul, m..."
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...,"[asking, supporters, prefix, chowkidar, names,..."
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...,"[answer, among, powerful, world, leader, today..."


In [58]:
df.dropna(subset=['category'], inplace=True)

In [65]:
# Sample text data
texts = df['clean_text'].values
labels = df['category'].values

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy*100)

Accuracy: 91.92820984813622


In [66]:
# Make predictions on the test dataset
y_pred = model.predict(X_test)

In [67]:
# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.92      0.81      0.86      7230
         0.0       0.90      0.98      0.94     10961
         1.0       0.94      0.93      0.93     14404

    accuracy                           0.92     32595
   macro avg       0.92      0.91      0.91     32595
weighted avg       0.92      0.92      0.92     32595

