#### ***Preprocessing***

***Imports***

In [12]:
!pip install -U datasets
from google.colab import files
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

# Download NLTK resources if not already downloaded
nltk.download('stopwords', quiet=True)
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer





[nltk_data] Downloading package wordnet to /root/nltk_data...


***Datasets***


In [10]:
# Upload the dataset files through the Colab file picker, then read them from
# the current working directory.
# NOTE(review): the recorded output shows re-uploaded files being saved under
# new names (e.g. 'test (2).csv'), while the reads below use the plain names,
# so an earlier copy may be what actually gets loaded - confirm.
uploaded = files.upload()

# The training split is read as JSON Lines; dev and test are UTF-8 CSV files.
df_train = pd.read_json('train.json1', lines=True)
df_dev, df_test = (
    pd.read_csv(csv_name, encoding='utf-8') for csv_name in ('dev.csv', 'test.csv')
)

# Alternative for a local checkout (kept for reference, not executed):
#   df_train = pd.read_csv('../dataset/data/train.csv', encoding='utf-8')
#   df_dev = pd.read_csv('../dataset/data/dev.csv', encoding='utf-8')
#   df_test = pd.read_csv('../dataset/data/test.csv', encoding='utf-8')

# Report the label balance of every split.
for heading, split in (
    ("Train class distribution:", df_train),
    ("\nDev class distribution:", df_dev),
    ("\nTest class distribution:", df_test),
):
    print(heading)
    print(split['label'].value_counts())

Saving test.csv to test (2).csv
Saving train.json1 to train (2).json1
Saving dev.csv to dev (2).csv
Train class distribution:
label
1    408
0    342
2    250
Name: count, dtype: int64

Dev class distribution:
label
1    81
0    53
2    26
Name: count, dtype: int64

Test class distribution:
label
1    82
0    53
2    25
Name: count, dtype: int64


In [13]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Translation tables are built once instead of on every preprocess() call.
# Typographic quotes are mapped to their ASCII equivalents so that the
# punctuation table below removes them like any other quote character.
_CURLY_QUOTES = str.maketrans({'“': '"', '”': '"', '‘': "'", '’': "'"})
_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + '0123456789')


# --- Text Preprocessing ---
def preprocess(text):
    """Normalise a raw text string for TF-IDF vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. convert curly quotes to ASCII quotes;
      3. delete ASCII punctuation and digits (no space is inserted);
      4. replace each run of remaining non-ASCII characters with one space;
      5. lemmatize every whitespace-separated token with the module-level
         ``lemmatizer`` and re-join the tokens with single spaces.

    Quote conversion must happen before punctuation removal. With the steps
    the other way round, the converted ASCII quotes would stay glued to their
    tokens, so curly-quoted text would be treated differently from
    straight-quoted text.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lemmatized string.
    """
    text = text.lower()
    text = text.translate(_CURLY_QUOTES)
    text = text.translate(_PUNCT_AND_DIGITS)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# Build the English stop-word set once; it is consulted for every token.
stop_words = set(stopwords.words('english'))


def _strip_stop_words(sentence):
    """Return `sentence` without the tokens that appear in `stop_words`."""
    kept = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept)


# Clean the 'text' column of every split in place: run preprocess() first,
# then drop stop words.
# NOTE(review): stop words are filtered after lemmatisation, so a token the
# lemmatizer alters may no longer match the stop list - verify.
for df in [df_train, df_dev, df_test]:
    cleaned = df['text'].apply(preprocess)
    df['text'] = cleaned.apply(_strip_stop_words)


### ***Training***

In [14]:
# TF-IDF features over unigrams and bigrams, capped at 10,000 features.
# The vectorizer is fitted on the training text only; dev and test are
# projected onto that fitted vocabulary.
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), max_features=10000)
X_train = tfidf.fit_transform(df_train['text'])
X_dev, X_test = (tfidf.transform(frame['text']) for frame in (df_dev, df_test))

# Target labels for each split.
y_train, y_dev, y_test = (frame['label'] for frame in (df_train, df_dev, df_test))

# Oversample the training data with SMOTE to handle class imbalance.
# NOTE(review): the resampled matrix is what GridSearchCV later splits into
# folds, so synthetic samples can end up in a different fold than the points
# they were generated from - consider resampling inside the CV loop.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Hyperparameter search space: one sub-grid per SVM kernel, so that each
# kernel is only combined with the parameters listed for it.
linear_space = dict(kernel=['linear'], C=[0.01, 0.1, 1, 10, 100])
rbf_space = dict(
    kernel=['rbf'],
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.001],
)
poly_space = dict(
    kernel=['poly'],
    C=[0.1, 1],
    gamma=['scale', 'auto'],
    degree=[2, 3],
)

param_grid = [linear_space, rbf_space, poly_space]

***Hyperparameter Search (SVM)***

In [15]:
# Tune the SVM with 5-fold cross-validation on the SMOTE-resampled TRAIN data.
# The DEV split is not used here; it is scored separately once the best
# estimator is known. The progress message states this, where it previously
# said the search ran "on DEV set".
print("🔍 Performing extensive grid search with 5-fold CV on the resampled TRAIN set...\n")
grid_search = GridSearchCV(
    SVC(class_weight='balanced', random_state=42),
    param_grid,
    cv=5,                # 5 folds per candidate
    scoring='accuracy',  # candidates are ranked by mean fold accuracy
    n_jobs=-1,           # use all available cores
    verbose=2
)
grid_search.fit(X_train_resampled, y_train_resampled)

# With GridSearchCV's default refit, best_estimator_ is the winning
# configuration retrained on the full resampled training data.
best_svm = grid_search.best_estimator_
print(f"✅ Best Parameters: {grid_search.best_params_}\n")


🔍 Performing extensive grid search on DEV set...

Fitting 5 folds for each of 29 candidates, totalling 145 fits
✅ Best Parameters: {'C': 1, 'kernel': 'linear'}



***DEV Set Evaluation + Retraining Data (TRAIN + DEV)***

In [16]:
# Score the tuned estimator on the held-out DEV split.
y_dev_pred = best_svm.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(
    "📊 Evaluation on DEV set:",
    classification_report(y_dev, y_dev_pred),
    f'Dev Accuracy: {dev_accuracy:.4f}',
    sep='\n',
)

# Prepare TRAIN + DEV for the final fit.
# Note: fit_transform refits `tfidf` itself, so matrices produced by the
# earlier fit (X_dev, X_test) no longer match its vocabulary after this point.
print("\n🔁 Retraining on TRAIN + DEV with best parameters...\n")
combined_text = pd.concat([df_train['text'], df_dev['text']])
X_combined = tfidf.fit_transform(combined_text)
y_combined = pd.concat([y_train, y_dev])
X_comb_resampled, y_comb_resampled = smote.fit_resample(X_combined, y_combined)


📊 Evaluation on DEV set:
              precision    recall  f1-score   support

           0       0.83      0.91      0.86        53
           1       0.78      0.74      0.76        81
           2       0.52      0.50      0.51        26

    accuracy                           0.76       160
   macro avg       0.71      0.72      0.71       160
weighted avg       0.75      0.76      0.75       160

Dev Accuracy: 0.7562

🔁 Retraining on TRAIN + DEV with best parameters...



***Final Model***

In [17]:
# Rebuild the SVM from the grid-search winner. 'gamma' and 'degree' are only
# present in best_params_ for the kernels whose sub-grid lists them, so the
# fallbacks below ('scale' and 3) apply otherwise.
best_params = grid_search.best_params_
svc_settings = {
    'kernel': best_params['kernel'],
    'C': best_params['C'],
    'gamma': best_params.get('gamma', 'scale'),
    'degree': best_params.get('degree', 3),
    'class_weight': 'balanced',
    'random_state': 42,
    'probability': True,  # enable predict_proba for confidence reporting
}
final_model = SVC(**svc_settings)

# Fit on the resampled TRAIN + DEV data; fit() returns the estimator, which
# the notebook displays as the cell result.
final_model.fit(X_comb_resampled, y_comb_resampled)


### ***Test***

In [18]:
# Final evaluation: vectorise the TEST text with the current `tfidf`
# (refitted on TRAIN + DEV) and score the final model on it.
X_test_final = tfidf.transform(df_test['text'])
y_test_pred = final_model.predict(X_test_final)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(
    "🧪 Final Evaluation on TEST Set:",
    classification_report(y_test, y_test_pred),
    f'Test Accuracy: {test_accuracy:.4f}',
    sep='\n',
)

🧪 Final Evaluation on TEST Set:
              precision    recall  f1-score   support

           0       0.84      0.68      0.75        53
           1       0.78      0.87      0.82        82
           2       0.65      0.68      0.67        25

    accuracy                           0.78       160
   macro avg       0.76      0.74      0.75       160
weighted avg       0.78      0.78      0.77       160

Test Accuracy: 0.7750


In [19]:
# Collect the test text, the gold labels and the model predictions side by
# side. The 'text' column holds the preprocessed text, because df_test['text']
# was overwritten during cleaning.
result_columns = {
    'text': df_test['text'],
    'label': y_test,
    'prediction': y_test_pred,
}
test_results = pd.DataFrame(result_columns)

# Persist the table without the index column.
test_results.to_csv('test_results.csv', index=False)


### **DEMO**


In [None]:
# Read the demo file; only its 'text' column is used below.
df_demo = pd.read_csv('../dataset/data/demo.csv', encoding='utf-8')

# Same cleaning as for the training data: preprocess(), then stop-word removal.
df_demo['text'] = (
    df_demo['text']
    .apply(preprocess)
    .apply(lambda sentence: ' '.join(token for token in sentence.split() if token not in stop_words))
)

# Vectorise with the already-fitted TF-IDF and classify with the final model.
X_demo = tfidf.transform(df_demo['text'])
y_demo_pred = final_model.predict(X_demo)

# Pair each cleaned text with its predicted label, save, and show the table.
demo_results = pd.DataFrame({'text': df_demo['text'], 'prediction': y_demo_pred})
demo_results.to_csv('demo_predictions.csv', index=False)
print(demo_results)

                                                text  prediction
0  reference reuse material project created imple...           1
1  also saw continuation activity regarding light...           2
2  aim model screen worst emitter affected policy...           1
3  alignment un sdgs global citizen lg chem makin...           1
4  outcome bank explained would winding fossil fu...           1
5  climate change producing change weather enviro...           0


### **USER INPUT**


In [None]:
def predict_user_input_with_confidence(user_text):
    """Classify one piece of free text and report the model's confidence.

    The text goes through the same cleaning as the training data
    (``preprocess`` followed by stop-word removal). It is then vectorised with
    the fitted ``tfidf`` and classified by ``final_model``.

    Args:
        user_text: raw input string.

    Returns:
        A ``(label, confidence)`` tuple: the predicted class label and the
        probability ``final_model.predict_proba`` assigns to that label.
    """
    # Clean the input exactly like the training text.
    cleaned = preprocess(user_text)
    cleaned = ' '.join(word for word in cleaned.split() if word not in stop_words)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    X_user = tfidf.transform([cleaned])

    predicted_label = final_model.predict(X_user)[0]
    probabilities = final_model.predict_proba(X_user)[0]

    # predict_proba columns follow the order of final_model.classes_, which
    # need not coincide with the label values. Look up the column by position
    # in classes_ instead of indexing with the label itself.
    # NOTE(review): scikit-learn documents that, for SVC(probability=True),
    # predict_proba can disagree with predict, so this value is not
    # necessarily the largest probability in the row.
    class_position = list(final_model.classes_).index(predicted_label)
    confidence = probabilities[class_position]

    return predicted_label, confidence

# Human-readable names for the numeric class labels.
label_mapping = {0: 'Risk', 1: 'Neutral', 2: 'Opportunity'}  # Adjust based on your label encoding

# Prompt for a text, classify it, and report the label name with its
# confidence.
user_input = input("Enter the text for prediction: ")
prediction, confidence = predict_user_input_with_confidence(user_input)

print(
    f"Predicted label for the input text: {label_mapping[prediction]}",
    f"Confidence level: {confidence:.4f}",
    sep='\n',
)


Predicted label for the input text: 1
Confidence level: 0.7632


In [21]:
# Download model and vectorizer
import json
import os
import zipfile

import joblib
from google.colab import files

MODEL_DIR = "climate_sentiment_model"
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the trained classifier and the fitted TF-IDF vectorizer. Both are
# needed to reproduce predictions.
joblib.dump(final_model, os.path.join(MODEL_DIR, "model.joblib"))
joblib.dump(tfidf, os.path.join(MODEL_DIR, "vectorizer.joblib"))

# Bundle the directory into one archive for download.
zip_path = "climate_sentiment_model.zip"
with zipfile.ZipFile(zip_path, "w") as zipf:
    # The loop variable is named `filenames`, not `files`, so it does not
    # shadow the google.colab `files` module used for the download below.
    for root, _dirs, filenames in os.walk(MODEL_DIR):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Store entries under their path relative to the working
            # directory, so files in nested folders keep their sub-path
            # instead of being flattened into one level.
            zipf.write(file_path, arcname=file_path)

# Trigger the browser download of the archive that was just written.
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>