In [164]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [165]:
filePath = 'twitter_training.csv'
data = pd.read_csv(filePath)

In [166]:
data.columns = ['TweetID', 'Borderlands', 'Sentiment', 'Comment']
data.dropna(subset=['Comment'], inplace=True)

uniqueSentiments = data['Sentiment'].unique()
sampleData = data.head()

uniqueSentiments, sampleData

(array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object),
    TweetID  Borderlands Sentiment  \
 0     2401  Borderlands  Positive   
 1     2401  Borderlands  Positive   
 2     2401  Borderlands  Positive   
 3     2401  Borderlands  Positive   
 4     2401  Borderlands  Positive   
 
                                              Comment  
 0  I am coming to the borders and I will kill you...  
 1  im getting on borderlands and i will kill you ...  
 2  im coming on borderlands and i will murder you...  
 3  im getting on borderlands 2 and i will murder ...  
 4  im getting into borderlands and i can murder y...  )

In [167]:
data['Comment'] = data['Comment'].str.lower().str.replace('[^\w\s]', ' ')

labelEncoder = LabelEncoder()
data['Sentiment'] = labelEncoder.fit_transform(data['Sentiment'])

tfidfVectorizer = TfidfVectorizer(max_features= 5000 ,stop_words='english')
xTfidf = tfidfVectorizer.fit_transform(data['Comment'])
y = data['Sentiment']

xTfidf.shape, y.shape, dict(zip(labelEncoder.classes_, labelEncoder.transform(labelEncoder.classes_)))

((73995, 5000),
 (73995,),
 {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3})

In [168]:
xTrain, xTest, yTrain, yTest = train_test_split(xTfidf, y, test_size=0.2, random_state=42)

xgbModel = XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss')
xgbModel.fit(xTrain, yTrain)

yPred = xgbModel.predict(xTest)

classificationReportData = classification_report(yTest, yPred, target_names = labelEncoder.classes_)
classificationReportData

Parameters: { "use_label_encoder" } are not used.



'              precision    recall  f1-score   support\n\n  Irrelevant       0.75      0.36      0.48      2624\n    Negative       0.58      0.84      0.69      4463\n     Neutral       0.72      0.56      0.63      3589\n    Positive       0.66      0.69      0.67      4123\n\n    accuracy                           0.64     14799\n   macro avg       0.68      0.61      0.62     14799\nweighted avg       0.67      0.64      0.63     14799\n'

In [169]:
accuracy = accuracy_score(yTest, yPred)

print("Accuracy of the model: {:.2f}%".format(accuracy * 100))

Accuracy of the model: 64.42%


In [170]:
def predictSentiment(text):
    textInput = text.lower().replace('[^\w\s]', ' ')
    textTfidf = tfidfVectorizer.transform([textInput])
    prediction = xgbModel.predict(textTfidf)

    sentimentLabel = labelEncoder.inverse_transform(prediction)[0]

    return sentimentLabel

In [171]:
positiveText = "I really love this game! It's so exciting and well-made."
negativeText = "I am very disappointed with this game. It crashes frequently and lacks good content."
neutralText = "This game was released yesterday. It features standard gameplay mechanics."
irrelevantText = "It’s going to rain tomorrow. Better bring an umbrella!"

In [172]:
predictedPositiveSentiment = predictSentiment(positiveText)
predictedNegativeSentiment = predictSentiment(negativeText)
predictedNeutralSentiment = predictSentiment(neutralText)
predictedIrrelevantSentiment = predictSentiment(irrelevantText)



print("Predicted Sentiment [Positive]: ", predictedPositiveSentiment)
print("Predicted Sentiment [Negative]: ", predictedNegativeSentiment)
print("Predicted Sentiment [Neutral]: ", predictedNeutralSentiment)
print("Predicted Sentiment [Irrelevant]: ", predictedIrrelevantSentiment)

Predicted Sentiment [Positive]:  Positive
Predicted Sentiment [Negative]:  Negative
Predicted Sentiment [Neutral]:  Negative
Predicted Sentiment [Irrelevant]:  Neutral
