In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
 from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [3]:
train_data = pd.read_csv(r"C:\Users\ASUS\Desktop\PRODIGY\twitter sentiment analysis\twitter_training.csv")
validation_data = pd.read_csv(r"C:\Users\ASUS\Desktop\PRODIGY\twitter sentiment analysis\twitter_validation.csv")

In [4]:
train_data.head(5)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
validation_data.head(5)

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [6]:
train_data.columns = ['Header1', 'company','labels','text']
validation_data.columns = ['Header1', 'company','labels','text']

In [7]:
train_data.isnull().sum()

Header1      0
company      0
labels       0
text       686
dtype: int64

In [8]:
validation_data.isnull().sum()

Header1    0
company    0
labels     0
text       0
dtype: int64

In [9]:
train_data.head(3)

Unnamed: 0,Header1,company,labels,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...


In [10]:
train_data.dropna(inplace=True)

In [11]:
train_data.isnull().sum()

Header1    0
company    0
labels     0
text       0
dtype: int64

In [12]:
train_data=train_data.drop(columns=['Header1','company'])
validation_data=validation_data.drop(columns=['Header1','company'])

In [13]:
train_data.head(3)

Unnamed: 0,labels,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...


In [14]:
validation_data.head(3)


Unnamed: 0,labels,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."


In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

In [21]:
def preprocess(text):
    text = text.lower()

    # 2. Removing URLs, mentions, and hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  
    text = re.sub(r"@\w+", '', text)  
    text = re.sub(r"#\w+", '', text)  

    # 3. Handling negations
    negations = {"not", "no", "never", "n't"}
    tokens = text.split()
    for i in range(len(tokens) - 1):
        if tokens[i] in negations:
            tokens[i+1] = "not_" + tokens[i+1]

    # 4. Tokenization
    tokens = nltk.word_tokenize(" ".join(tokens))

    # 5. Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 6. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 7. Reducing lengthening 
    tokens = [re.sub(r'(.)\1+', r'\1\1', word) for word in tokens]

    # 8. Convert emojis to text
    text = emoji.demojize(" ".join(tokens))
    
    return text


In [23]:
train_data['text']=train_data['text'].apply(preprocess)

In [26]:
train_data['text'].head(2)

0            coming border kill ,
1    im getting borderland kill ,
Name: text, dtype: object

In [25]:
validation_data['text']=validation_data['text'].apply(preprocess)

In [28]:
validation_data['text'].head(2)

0    bbc news - amazon bos jeff bezos reject claim ...
1    pay word function poorly chromebook ? :face_wi...
Name: text, dtype: object

In [33]:
x_train = train_data['text']
y_train = train_data['labels']

In [34]:
tfidf= TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(x_train)

## NAIVE BAYES CLASSIFIER

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
naivebayes = MultinomialNB()
naivebayes.fit(x_train_tfidf, y_train)

In [39]:
x_validation = validation_data['text']
y_validation = validation_data['labels']

In [40]:
x_validation_tfidf = tfidf.transform(x_validation)

In [41]:
y_pred_nb = naivebayes.predict(x_validation_tfidf)

In [42]:
print("Naive Bayes Classifier:")
print(classification_report(y_validation, y_pred_nb))
print("Accuracy:", accuracy_score(y_validation, y_pred_nb))


Naive Bayes Classifier:
              precision    recall  f1-score   support

  Irrelevant       0.96      0.57      0.71       171
    Negative       0.72      0.91      0.80       266
     Neutral       0.92      0.71      0.80       285
    Positive       0.73      0.90      0.81       277

    accuracy                           0.79       999
   macro avg       0.83      0.77      0.78       999
weighted avg       0.82      0.79      0.79       999

Accuracy: 0.7917917917917918


## RANDOM FOREST CLASSIFIER

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
randomforest = RandomForestClassifier()

In [51]:
randomforest.fit(x_train_tfidf, y_train)

In [52]:
y_pred_rf = randomforest.predict(x_validation_tfidf)

In [53]:
print("\nRandom Forest Classifier:")
print(classification_report(y_validation, y_pred_rf))
print("Accuracy:", accuracy_score(y_validation, y_pred_rf))


Random Forest Classifier:
              precision    recall  f1-score   support

  Irrelevant       0.98      0.96      0.97       171
    Negative       0.95      0.98      0.96       266
     Neutral       0.97      0.95      0.96       285
    Positive       0.97      0.97      0.97       277

    accuracy                           0.96       999
   macro avg       0.97      0.96      0.97       999
weighted avg       0.97      0.96      0.96       999

Accuracy: 0.964964964964965
