**CELL 1 â€” IMPORT LIBRARY**

In [16]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


**CELL 2 â€” LOAD DATASET**

In [17]:
df = pd.read_csv('/content/sentimentdataset.csv')
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! ðŸ’ª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


**CELL 3 â€” CEK KOLOM**

In [18]:
df.columns


Index(['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User',
       'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month',
       'Day', 'Hour'],
      dtype='object')

**CELL 4 â€” HAPUS KOLOM SAMPAH**

In [19]:
df = df.loc[:, ~df.columns.str.contains('Unnamed')]
df.columns


Index(['Text', 'Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags',
       'Retweets', 'Likes', 'Country', 'Year', 'Month', 'Day', 'Hour'],
      dtype='object')

**CELL 5 â€” CEK ISI SENTIMENT**

In [20]:
df['Sentiment'].value_counts(dropna=False)


Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Positive,44
Joy,42
Excitement,32
Happy,14
Neutral,14
...,...
Vibrancy,1
Culinary Adventure,1
Mesmerizing,1
Thrilling Journey,1


**CELL 6 â€” CLEANING TEXT**

In [21]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['clean_text'] = df['Text'].apply(clean_text)
df[['Text', 'clean_text']].head()


Unnamed: 0,Text,clean_text
0,Enjoying a beautiful day at the park! ...,enjoying a beautiful day at the park ...
1,Traffic was terrible this morning. ...,traffic was terrible this morning ...
2,Just finished an amazing workout! ðŸ’ª ...,just finished an amazing workout
3,Excited about the upcoming weekend getaway! ...,excited about the upcoming weekend getaway ...
4,Trying out a new recipe for dinner tonight. ...,trying out a new recipe for dinner tonight ...


**CELL 7 â€” ENCODING LABEL**

In [22]:
# Paksa semua jadi string
df['Sentiment'] = df['Sentiment'].astype(str)

# Bersihkan tulisan
df['Sentiment'] = df['Sentiment'].str.strip().str.lower()

# Mapping ke angka
df['Sentiment'] = df['Sentiment'].map({
    'positive': 1,
    'negative': 0
})

# Buang data gagal mapping
df = df.dropna(subset=['Sentiment'])

# Pastikan bersih
df['Sentiment'].value_counts()


Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
1.0,45
0.0,4


**CELL 8 â€” SPLIT DATA**

In [23]:
X = df['clean_text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print("Train:", X_train.shape)
print("Test :", X_test.shape)


Train: (39,)
Test : (10,)


**CELL 9 â€” TF-IDF**

In [24]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


**CELL 10 â€” MODELING**

In [25]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


**CELL 11 â€” PREDIKSI**

In [26]:
y_pred = model.predict(X_test_tfidf)


**CELL 12 â€” EVALUASI**

In [27]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Confusion Matrix:

[[10]]




**CELL 13 â€” TES MANUAL**

In [28]:
contoh = ["this product is very bad and disappointing"]

contoh_clean = [clean_text(contoh[0])]
contoh_tfidf = vectorizer.transform(contoh_clean)

hasil = model.predict(contoh_tfidf)

if hasil[0] == 1:
    print("Sentimen: POSITIF")
else:
    print("Sentimen: NEGATIF")


Sentimen: POSITIF


In [29]:
import joblib

# Simpan model
joblib.dump(model, 'sentiment_model.pkl')

# Simpan vectorizer TF-IDF
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model dan vectorizer berhasil disimpan")


Model dan vectorizer berhasil disimpan
