# **Import Dataset**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/Project Portofolio/Sentiment Analysis/sentimentdataset.csv"
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


# **Exploratory Data Analysis**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB


In [5]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0.1,0
Unnamed: 0,0
Text,0
Sentiment,0
Timestamp,0
User,0
Platform,0
Hashtags,0
Retweets,0
Likes,0


In [6]:
#mengubah kalimat menjadi huruf kecil semua
dt = df["Text"].str.lower()
print(dt)

0       enjoying a beautiful day at the park!        ...
1       traffic was terrible this morning.           ...
2       just finished an amazing workout! 💪          ...
3       excited about the upcoming weekend getaway!  ...
4       trying out a new recipe for dinner tonight.  ...
                             ...                        
727    collaborating on a science project that receiv...
728    attending a surprise birthday party organized ...
729    successfully fundraising for a school charity ...
730    participating in a multicultural festival, cel...
731    organizing a virtual talent show during challe...
Name: Text, Length: 732, dtype: object


In [7]:
#menghapus simbol atau tanda baca unik yang tidak perlu
def remove_punctuation(text):
  return re.sub(r'[^\w\s]', '', text)

dt = dt.apply(remove_punctuation)
print(dt)

0       enjoying a beautiful day at the park         ...
1       traffic was terrible this morning            ...
2       just finished an amazing workout                
3       excited about the upcoming weekend getaway   ...
4       trying out a new recipe for dinner tonight   ...
                             ...                        
727    collaborating on a science project that receiv...
728    attending a surprise birthday party organized ...
729    successfully fundraising for a school charity ...
730    participating in a multicultural festival cele...
731    organizing a virtual talent show during challe...
Name: Text, Length: 732, dtype: object


In [8]:
dt.str.strip()
print(dt)

0       enjoying a beautiful day at the park         ...
1       traffic was terrible this morning            ...
2       just finished an amazing workout                
3       excited about the upcoming weekend getaway   ...
4       trying out a new recipe for dinner tonight   ...
                             ...                        
727    collaborating on a science project that receiv...
728    attending a surprise birthday party organized ...
729    successfully fundraising for a school charity ...
730    participating in a multicultural festival cele...
731    organizing a virtual talent show during challe...
Name: Text, Length: 732, dtype: object


In [9]:
#menghapus angka pada text
rev_num=''.join([i for i in dt if not i.isdigit()])
rev_num

' enjoying a beautiful day at the park               traffic was terrible this morning                  just finished an amazing workout                 excited about the upcoming weekend getaway         trying out a new recipe for dinner tonight         feeling grateful for the little things in life     rainy days call for cozy blankets and hot cocoa    the new movie release is a mustwatch              political discussions heating up on the timeline   missing summer vibes and beach days                just published a new blog post check it out       feeling a bit under the weather today              exploring the citys hidden gems                   new year new fitness goals                      technology is changing the way we live             reflecting on the past and looking ahead           just adopted a cute furry friend                latenight gaming session with friends             attending a virtual conference on ai               winter blues got me feeling low          

In [10]:
print(dt)

0       enjoying a beautiful day at the park         ...
1       traffic was terrible this morning            ...
2       just finished an amazing workout                
3       excited about the upcoming weekend getaway   ...
4       trying out a new recipe for dinner tonight   ...
                             ...                        
727    collaborating on a science project that receiv...
728    attending a surprise birthday party organized ...
729    successfully fundraising for a school charity ...
730    participating in a multicultural festival cele...
731    organizing a virtual talent show during challe...
Name: Text, Length: 732, dtype: object


In [11]:
# Memisahkan setiap kata dalam setiap dokumen teks
print('HASIL TOKENISASI')
hasil_tokenisasi = dt.apply(lambda x: x.split())
print(hasil_tokenisasi)

HASIL TOKENISASI
0           [enjoying, a, beautiful, day, at, the, park]
1                [traffic, was, terrible, this, morning]
2                 [just, finished, an, amazing, workout]
3      [excited, about, the, upcoming, weekend, getaway]
4      [trying, out, a, new, recipe, for, dinner, ton...
                             ...                        
727    [collaborating, on, a, science, project, that,...
728    [attending, a, surprise, birthday, party, orga...
729    [successfully, fundraising, for, a, school, ch...
730    [participating, in, a, multicultural, festival...
731    [organizing, a, virtual, talent, show, during,...
Name: Text, Length: 732, dtype: object


In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
#menghapus kata tidak bermakna seperti a, i, and, dan sebagainya
stop_words = set(stopwords.words('english'))
hasil_tokenisasi = hasil_tokenisasi.apply(lambda x: [word for word in x if word not in stop_words])
hasil_tokenisasi

Unnamed: 0,Text
0,"[enjoying, beautiful, day, park]"
1,"[traffic, terrible, morning]"
2,"[finished, amazing, workout]"
3,"[excited, upcoming, weekend, getaway]"
4,"[trying, new, recipe, dinner, tonight]"
...,...
727,"[collaborating, science, project, received, re..."
728,"[attending, surprise, birthday, party, organiz..."
729,"[successfully, fundraising, school, charity, i..."
730,"[participating, multicultural, festival, celeb..."


In [14]:
#mengubah kata ke bentuk dasar (lemmatization)
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

dt = hasil_tokenisasi.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
dt

Unnamed: 0,Text
0,"[enjoying, beautiful, day, park]"
1,"[traffic, terrible, morning]"
2,"[finished, amazing, workout]"
3,"[excited, upcoming, weekend, getaway]"
4,"[trying, new, recipe, dinner, tonight]"
...,...
727,"[collaborating, science, project, received, re..."
728,"[attending, surprise, birthday, party, organiz..."
729,"[successfully, fundraising, school, charity, i..."
730,"[participating, multicultural, festival, celeb..."


In [16]:
#menggabungkan kembali kata menjadi kalimat
dt = dt.apply(lambda x: ' '.join(x))
print(dt)

0                            enjoying beautiful day park
1                               traffic terrible morning
2                               finished amazing workout
3                       excited upcoming weekend getaway
4                       trying new recipe dinner tonight
                             ...                        
727    collaborating science project received recogni...
728    attending surprise birthday party organized fr...
729    successfully fundraising school charity initia...
730    participating multicultural festival celebrati...
731    organizing virtual talent show challenging tim...
Name: Text, Length: 732, dtype: object


In [18]:
print(len(dt), len(df))

732 732


In [19]:
df['Text'] = dt.values  # Menambahkan kolom teks bersih ke df
df = df[['Text', 'Sentiment']]  # Pilih hanya kolom yang dibutuhkan

In [20]:
df

Unnamed: 0,Text,Sentiment
0,enjoying beautiful day park,Positive
1,traffic terrible morning,Negative
2,finished amazing workout,Positive
3,excited upcoming weekend getaway,Positive
4,trying new recipe dinner tonight,Neutral
...,...,...
727,collaborating science project received recogni...,Happy
728,attending surprise birthday party organized fr...,Happy
729,successfully fundraising school charity initia...,Happy
730,participating multicultural festival celebrati...,Happy


In [21]:
print(df["Sentiment"].value_counts())

Sentiment
Positive               44
Joy                    42
Excitement             32
Happy                  14
Neutral                14
                       ..
Vibrancy                1
Culinary Adventure      1
Mesmerizing             1
Thrilling Journey       1
Winter Magic            1
Name: count, Length: 279, dtype: int64


In [22]:
print(df["Sentiment"].unique())  # Lihat kategori sentimen yang ada

[' Positive  ' ' Negative  ' ' Neutral   ' ' Anger        '
 ' Fear         ' ' Sadness      ' ' Disgust      ' ' Happiness    '
 ' Joy          ' ' Love         ' ' Amusement    ' ' Enjoyment    '
 ' Admiration   ' ' Affection    ' ' Awe          ' ' Disappointed '
 ' Surprise     ' ' Acceptance   ' ' Adoration    ' ' Anticipation '
 ' Bitter       ' ' Calmness     ' ' Confusion    ' ' Excitement   '
 ' Kind         ' ' Pride        ' ' Shame        ' ' Confusion '
 ' Excitement ' ' Shame ' ' Elation       ' ' Euphoria      '
 ' Contentment   ' ' Serenity      ' ' Gratitude     ' ' Hope          '
 ' Empowerment   ' ' Compassion    ' ' Tenderness    ' ' Arousal       '
 ' Enthusiasm    ' ' Fulfillment  ' ' Reverence     ' ' Compassion'
 ' Fulfillment   ' ' Reverence ' ' Elation   ' ' Despair         '
 ' Grief           ' ' Loneliness      ' ' Jealousy        '
 ' Resentment      ' ' Frustration     ' ' Boredom         '
 ' Anxiety         ' ' Intimidation    ' ' Helplessness    '
 ' 

In [23]:
df.loc[:, "Sentiment"] = df["Sentiment"].str.strip().str.title()

In [24]:
import pandas as pd

# Daftar sentimen berdasarkan kategori
positive_sentiments = {"Positive", "Happiness", "Joy", "Love", "Excitement", "Hope", "Admiration", "Pride", "Gratitude", "Euphoria", "Optimism", "Satisfaction", "Enchantment", "Serenity", "Amusement"}
negative_sentiments = {"Negative", "Anger", "Fear", "Sadness", "Disgust", "Disappointed", "Despair", "Grief", "Loneliness", "Jealousy", "Resentment", "Frustration", "Boredom", "Anxiety", "Regret", "Betrayal", "Sorrow", "Hate"}
neutral_sentiments = {"Neutral", "Indifference", "Confusion", "Ambivalence", "Reflection", "Calmness", "Acceptance"}

def categorize_sentiment(sentiment):
    sentiment = sentiment.strip()  # Hapus spasi berlebih
    if sentiment in positive_sentiments:
        return "Positive"
    elif sentiment in negative_sentiments:
        return "Negative"
    else:
        return "Neutral"

df.loc[:, "Sentiment"] = df["Sentiment"].apply(categorize_sentiment)

# Cek hasilnya
print(df["Sentiment"].unique())

['Positive' 'Negative' 'Neutral']


In [25]:
df["Sentiment"].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,448
Positive,201
Negative,83


In [26]:
from sklearn.preprocessing import LabelEncoder

df = df.copy()
df['Sentiment'] = LabelEncoder().fit_transform(df['Sentiment']).astype(int)
print(df.dtypes)

Text         object
Sentiment     int64
dtype: object


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       732 non-null    object
 1   Sentiment  732 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.6+ KB


# **Pemodelan**

## **Logistic Regression**

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Konversi teks ke fitur numerik dengan TF-IDF (pakai bigram)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
x = vectorizer.fit_transform(df['Text'])
y = df['Sentiment'].astype(int)

# Split sebelum SMOTE
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

# Cek distribusi sebelum SMOTE
print("Distribusi sebelum SMOTE:", Counter(y_train))

# Terapkan SMOTE hanya di training set
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Cek distribusi setelah SMOTE
print("Distribusi setelah SMOTE:", Counter(y_train_resampled))

# Gunakan Logistic Regression dengan class balancing
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(x_train_resampled, y_train_resampled)  # Gunakan data hasil SMOTE

# Prediksi
y_pred = model.predict(x_test)

# Evaluasi
print(classification_report(y_test, y_pred))

Distribusi sebelum SMOTE: Counter({1: 358, 2: 161, 0: 66})
Distribusi setelah SMOTE: Counter({0: 358, 1: 358, 2: 358})
              precision    recall  f1-score   support

           0       0.50      0.24      0.32        17
           1       0.69      0.86      0.77        90
           2       0.64      0.45      0.53        40

    accuracy                           0.67       147
   macro avg       0.61      0.51      0.54       147
weighted avg       0.66      0.67      0.65       147



## **Random Forest**

In [29]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
rf_model.fit(x_train_resampled, y_train_resampled)

y_pred_rf = rf_model.predict(x_test)

print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.33      0.12      0.17        17
           1       0.65      0.88      0.75        90
           2       0.63      0.30      0.41        40

    accuracy                           0.63       147
   macro avg       0.54      0.43      0.44       147
weighted avg       0.61      0.63      0.59       147



## **XGBoost**

In [30]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=200, objective='multi:softmax', num_class=3, random_state=42)
xgb_model.fit(x_train_resampled, y_train_resampled)

y_pred_xgb = xgb_model.predict(x_test)

print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.62      0.47      0.53        17
           1       0.73      0.79      0.76        90
           2       0.59      0.55      0.57        40

    accuracy                           0.69       147
   macro avg       0.65      0.60      0.62       147
weighted avg       0.68      0.69      0.68       147

