In [4]:
import pandas as pd

# Load the dataset with proper encoding
df = pd.read_csv("test.csv", encoding="ISO-8859-1")

# Display the first few rows
print(df.head())


       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0        1246700.0   

   Density (P/Km²)  
0             60.0  
1            1

In [6]:
def preprocess_text(text):
    text = str(text)  # convert to string
    text = re.sub(r'[^\w\s]', '', text.lower())  # remove punctuation and lowercase
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)


In [7]:
df['clean_text'] = df['text'].apply(preprocess_text)


In [8]:
df = df.dropna(subset=['text'])  # Drop rows where 'text' is NaN


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])


In [11]:
print(df.columns)


Index(['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User',
       'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)',
       'clean_text'],
      dtype='object')


In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['sentiment'])


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# 1. Vectorize text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

# 2. Prepare target variable (sentiment column)
y = df['sentiment']  # make sure it's the correct column name

# 3. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train SVM model
model = LinearSVC()
model.fit(X_train, y_train)

# 5. Prediction & Evaluation
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6294200848656294
Classification Report:
               precision    recall  f1-score   support

    negative       0.67      0.54      0.60       207
     neutral       0.57      0.68      0.62       286
    positive       0.70      0.64      0.67       214

    accuracy                           0.63       707
   macro avg       0.65      0.62      0.63       707
weighted avg       0.64      0.63      0.63       707





In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

# Prediction & Evaluation
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5954738330975955
Classification Report:
               precision    recall  f1-score   support

    negative       0.81      0.34      0.48       207
     neutral       0.52      0.83      0.64       286
    positive       0.70      0.54      0.61       214

    accuracy                           0.60       707
   macro avg       0.68      0.57      0.57       707
weighted avg       0.66      0.60      0.58       707



In [18]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])

# 2. Target variable
y = df['sentiment']

# 3. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Predict and Evaluate
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6548797736916548
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.48      0.60       207
     neutral       0.58      0.78      0.66       286
    positive       0.73      0.65      0.69       214

    accuracy                           0.65       707
   macro avg       0.70      0.64      0.65       707
weighted avg       0.68      0.65      0.65       707

