In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
#validation dataset
val = pd.read_csv("twitter_validation.csv", header=None)

#training dataset
train = pd.read_csv("twitter_training.csv", header=None)

In [3]:
train.columns=['id', 'information','type', 'text']
train.head()

Unnamed: 0,id,information,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
val.columns=['id','information','type','text']
val.head()

Unnamed: 0,id,information,type,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [5]:
train_data = train
train_data

Unnamed: 0,id,information,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [6]:
val_data = val
val_data

Unnamed: 0,id,information,type,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [7]:
#Text transformation
import re

train_data["lower"]=train_data.text.str.lower() #lowercasing
train_data["lower"]=[str(data) for data in train_data.lower] #converting all to string
train_data["lower"]=train_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex

val_data["lower"]=val_data.text.str.lower() #lowercasing
val_data["lower"]=[str(data) for data in val_data.lower] #converting all to string
val_data["lower"]=val_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex

In [8]:
train_data.head()
#val_data.head()

Unnamed: 0,id,information,type,text,lower
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...


In [9]:
# Function to clean data
def clean_data(df):
    # Create a copy of the DataFrame
    df = df.copy()
    
    # Remove rows with NaN values in 'text' or 'sentiment' columns
    df = df.dropna(subset=['text', 'type'])
    
    # Replace any remaining NaN values in 'text' with an empty string
    df.loc[:, 'text'] = df['text'].fillna('')
    
    return df

In [10]:
# Clean the datasets
train_data = clean_data(train_data)
val_data = clean_data(val_data)

In [11]:
# Print dataset sizes after cleaning
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

Training set size: 73996
Validation set size: 1000


In [12]:
# Prepare the datasets
X_train, y_train = train_data['text'], train_data['type']
X_val, y_val = val_data['text'], val_data['type']
print(X_train)

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    Just realized that the Windows partition of my...
74678    Just realized that my Mac window partition is ...
74679    Just realized the windows partition of my Mac ...
74680    Just realized between the windows partition of...
74681    Just like the windows partition of my Mac is l...
Name: text, Length: 73996, dtype: object


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Feature extraction
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

In [14]:
# Define models and parameters for grid search
models = {
    'Naive Bayes': (MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
    'SVM': (LinearSVC(), {'C': [0.1, 1, 10]})
}

In [15]:
# Perform grid search with cross-validation
best_model = None
best_score = 0

for name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=5, scoring='f1_macro')
    grid_search.fit(X_train_vectorized, y_train)
    
    # Evaluate on validation set
    val_score = grid_search.score(X_val_vectorized, y_val)
    print(f"{name} best validation F1-score: {val_score:.4f}")
    print(f"Best parameters: {grid_search.best_params_}")
    
    if val_score > best_score:
        best_score = val_score
        best_model = grid_search.best_estimator_

Naive Bayes best validation F1-score: 0.7022
Best parameters: {'alpha': 1.0}




SVM best validation F1-score: 0.7778
Best parameters: {'C': 0.1}


In [16]:
# Save the best model
import joblib
joblib.dump(best_model, 'sentiment_analysis_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [17]:
# Load the trained model and vectorizer
model = joblib.load('sentiment_analysis_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

In [31]:
def predict_sentiment(text):
    if pd.isna(text) or not isinstance(text, str) or text.strip() == '':
        return "Neutral"  # Return neutral for empty, NaN, or non-string input
    vectorized_text = vectorizer.transform([text])
    prediction = model.predict(vectorized_text)[0]
    return prediction

In [32]:
def get_sentiment_score(sentiment):
    if sentiment == 'Positive':
        return 1
    elif sentiment == 'Negative':
        return -1
    elif sentiment == 'Neutral':  # Neutral
        return 0

In [33]:
def analyze_post_sentiment(post_description, comments):
    # Ensure post_description is a string
    post_description = str(post_description) if post_description is not None else ""
    
    # Analyze post description
    post_sentiment = predict_sentiment(post_description)
    post_score = get_sentiment_score(post_sentiment)
    
    # Ensure comments is a list and remove any None values
    comments = [str(comment) for comment in (comments or []) if comment is not None]
    
    # Analyze comments
    comment_sentiments = [predict_sentiment(comment) for comment in comments if comment.strip()]
    comment_scores = [get_sentiment_score(sentiment) for sentiment in comment_sentiments]
    
    # Calculate average sentiment score
    all_scores = [post_score] + comment_scores
    average_score = sum(all_scores) / len(all_scores) if all_scores else 0
    
    # Determine overall sentiment based on average score
    if average_score > 0.3:
        overall_sentiment = 'Positive'
    elif average_score < -0.3:
        overall_sentiment = 'Negative'
    else:
        overall_sentiment = 'Neutral'
    
    return {
        'post_sentiment': post_sentiment,
        'comment_sentiments': comment_sentiments,
        'average_score': average_score,
        'overall_sentiment': overall_sentiment
    }

In [25]:
# Example usage
post_description = "I've been having issues with the new software update. It's frustrating!"

comments = [
    "its good",
    "i haven't faced any poblems with it",
    "I actually find the new update quite useful.",
    "it good actually"
]

In [34]:
df = pd.read_csv("issues_and_comments.csv")

df.head()

Unnamed: 0,issue_id,title,description,comments
0,66facf9773d4d62ba79a7fab,Lack of proper drainage system,This is the condition of our roads during heav...,"['asd', 'sad', 'mero ghar pani dubyo :(', 'sad*']"
1,66fbf5ae9651f13398e0f5ce,Potholes in road,There are potholes in the street in my area an...,"['dayum', 'woohooo']"
2,66fbfa3d9651f13398e0f64d,Roadside parking,Due to lack of parking facilities we have to p...,['new comment']
3,66fc16a35e1271fd96cb27d5,Build a park at river bank,There are empty lands near river bank which ca...,[]
4,66fce63f4c8d1e09fe6620df,Build a park in this empty space,The vacant lot at Khusibun has been unused for...,['agreed']


In [39]:
def process_issues(df):
    for _, row in df.iterrows():
        try:
            post_description = row['description']
            comments = eval(row['comments']) if isinstance(row['comments'], str) else row['comments']
            print(row['comments'])
            result = analyze_post_sentiment(post_description, comments)
            print(f"Issue ID: {row['issue_id']}")
            print(f"Title: {row['title']}")
            print(f"Overall Sentiment: {result['overall_sentiment']}")
            print("---")
        except Exception as e:
            print(f"Error processing issue {row['issue_id']}: {str(e)}")

In [40]:
process_issues(df)

['asd', 'sad', 'mero ghar pani dubyo :(', 'sad*']
Error processing issue 66facf9773d4d62ba79a7fab: unsupported operand type(s) for +: 'int' and 'NoneType'
['dayum', 'woohooo']
Issue ID: 66fbf5ae9651f13398e0f5ce
Title: Potholes in road
Overall Sentiment: Positive
---
['new comment']
Issue ID: 66fbfa3d9651f13398e0f64d
Title: Roadside parking
Overall Sentiment: Negative
---
[]
Issue ID: 66fc16a35e1271fd96cb27d5
Title: Build a park at river bank
Overall Sentiment: Neutral
---
['agreed']
Error processing issue 66fce63f4c8d1e09fe6620df: unsupported operand type(s) for +: 'int' and 'NoneType'
[]
Issue ID: 66fd79b8a218577fe38f1c19
Title: floodd 
Overall Sentiment: Positive
---
['working ???']
Issue ID: 6704941df924f59bfe46a69d
Title: testing for province
Overall Sentiment: Negative
---
['yo']
Issue ID: 67049984be4d7e6190043c0c
Title: new issue
Overall Sentiment: Neutral
---
['comment test']
Issue ID: 670dfd338e757c76ef60d81c
Title: upvote test
Overall Sentiment: Neutral
---
[]
Issue ID: 671159

In [27]:
result = analyze_post_sentiment(post_description, comments)
print(f"Post Sentiment: {result['post_sentiment']}")
print(f"Comment Sentiments: {result['comment_sentiments']}")
print(f"Average Sentiment Score: {result['average_score']:.2f}")
print(f"Overall Sentiment: {result['overall_sentiment']}")

Post Sentiment: Negative
Comment Sentiments: ['Positive', 'Negative', 'Positive', 'Positive']
Average Sentiment Score: 0.20
Overall Sentiment: Neutral
