# Importing modules


In [None]:
import nltk
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report

# Load the training dataset

In [None]:
# Location of the labelled training data (Google Drive mount).
TRAIN_CSV_PATH = '/content/drive/MyDrive/DATA/train_2kmZucJ.csv'

try:
    train_df = pd.read_csv(TRAIN_CSV_PATH)
except FileNotFoundError:
    print("Ensure train_2kmZucJ.csv is uploaded.")
    # Small stand-in frame so the remaining cells can still be exercised.
    # It carries several rows per label because the later stratified 80/20
    # split needs at least two rows of every class and a validation slice
    # big enough to hold one row of each class.
    _demo_rows = [
        (0, 'I love my new phone!'),
        (1, 'My laptop is so slow and buggy $&@*#'),
        (0, 'Just got the new headset, amazing quality.'),
        (1, 'This tablet keeps crashing, worst purchase ever'),
        (0, 'Battery life on this watch is fantastic'),
        (1, 'Screen cracked after one day, so angry'),
        (0, 'Great camera and super fast delivery'),
        (1, 'The charger stopped working again, terrible'),
        (0, 'Really happy with the new update'),
        (1, 'Customer support was useless and rude'),
    ]
    train_df = pd.DataFrame({
        'id': range(len(_demo_rows)),
        'label': [label for label, _ in _demo_rows],
        'tweet': [tweet for _, tweet in _demo_rows],
    })

# ---  Preprocessing Steps ---

In [None]:
# Fetch the NLTK data packages used by the preprocessing step without
# printing download progress.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name, quiet=True)

# Shared helpers used by preprocess_text: a set gives fast stop-word lookups.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw tweet into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip URLs and @mentions, drop every character that is
    not a word character, whitespace, or one of ``$ & @ * #``, tokenise,
    discard English stop words and one-character tokens, then lemmatise.

    Args:
        text: Raw tweet. Missing values (``None`` / NaN) are treated as an
            empty tweet; any other non-string value is converted with ``str``.

    Returns:
        str: Cleaned tokens joined by single spaces ("" for missing input).
    """
    if not isinstance(text, str):
        # A CSV with blank tweet cells yields NaN, which has no .lower().
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    # `http\S+` already covers https links, and the pattern has no anchors,
    # so no extra alternative or MULTILINE flag is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    # Keep $ & @ * # while removing all other punctuation.
    # NOTE(review): word_tokenize appears to split runs such as "$&@*#" into
    # single-character tokens, which the length filter below then drops —
    # confirm whether these symbols were meant to survive as features.
    text = re.sub(r'[^\w\s\$&@\*#]', '', text)
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1
    ]
    return " ".join(cleaned_tokens)

# Run every raw tweet through preprocess_text and store the result next to
# the original text.
cleaned_tweets = train_df['tweet'].apply(preprocess_text)
train_df['cleaned_tweet'] = cleaned_tweets


# --- Define features (X) and target (y) and Split Data ---

In [None]:
# Features are the cleaned tweets; the target is the label column.
X, y = train_df['cleaned_tweet'], train_df['label']

# Hold out 20% of the rows for validation. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Build and Train the SVM Pipeline ---


In [None]:
# Text features: TF-IDF over unigrams and bigrams, capped at 10,000 terms.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# Classifier: linear-kernel SVM with class weights balanced against label
# frequency.
# NOTE(review): probability=True makes fitting run an extra internal
# cross-validation to calibrate predict_proba, yet no cell visible here calls
# predict_proba — consider dropping it if it is unused.
svc_step = SVC(kernel='linear', class_weight='balanced', random_state=42, probability=True)

svm_pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('svc', svc_step)])

print("Training the SVM model...")
svm_pipeline.fit(X_train, y_train)


# --- Evaluate the SVM Model ---

In [None]:

# Score the fitted pipeline on the held-out validation split.
y_pred_svm = svm_pipeline.predict(X_val)
val_f1_svm = f1_score(y_val, y_pred_svm, average='weighted')

# Human-readable names for labels 0 and 1, in that order.
class_names = ['Not Negative (0)', 'Negative (1)']
validation_report = classification_report(y_val, y_pred_svm, target_names=class_names)

print("\nSVM Model Evaluation:")
print(f"Validation Weighted F1-Score: {val_f1_svm:.4f}\n")
print("Validation Classification Report:")
print(validation_report)

Training the SVM model...

SVM Model Evaluation:
Validation Weighted F1-Score: 0.8854

Validation Classification Report:
                  precision    recall  f1-score   support

Not Negative (0)       0.95      0.89      0.92      1179
    Negative (1)       0.73      0.86      0.79       405

        accuracy                           0.88      1584
       macro avg       0.84      0.87      0.85      1584
    weighted avg       0.89      0.88      0.89      1584



# --- Predict on the test data using the trained SVM pipeline ---

In [None]:
print("Making predictions on the test set with the SVM model...")

# No earlier cell in this notebook loads or cleans the test set, so a fresh
# kernel would stop here with a NameError. Load and clean it on demand; a
# frame that already exists in the session is left untouched.
# NOTE(review): the test file name and folder are assumed to match the
# training CSV's location — confirm before relying on this path.
TEST_CSV_PATH = '/content/drive/MyDrive/DATA/test_oJQbWVk.csv'
if 'test_df' not in globals():
    test_df = pd.read_csv(TEST_CSV_PATH)
if 'cleaned_tweet' not in test_df.columns:
    # Use the same preprocessing as the training tweets so the fitted
    # TF-IDF vocabulary applies to the test text.
    test_df['cleaned_tweet'] = test_df['tweet'].apply(preprocess_text)

X_test = test_df['cleaned_tweet']
test_predictions_svm = svm_pipeline.predict(X_test)

# --- Create and save the submission file ---


In [None]:


# Pair each test id with its predicted label.
submission_columns = {
    'id': test_df['id'],
    'label': test_predictions_svm,
}
submission_df_svm = pd.DataFrame(submission_columns)

# Write without the pandas index so the file holds only the id and label columns.
submission_df_svm.to_csv('submission_svm.csv', index=False)

for message in (
    "\nSubmission file 'submission_svm.csv' created successfully!",
    "This file is ready for submission to the competition.",
    "\nFirst 5 rows of the new submission file:",
):
    print(message)
print(submission_df_svm.head())

Making predictions on the test set with the SVM model...

Submission file 'submission_svm.csv' created successfully!
This file is ready for submission to the competition.

First 5 rows of the new submission file:
     id  label
0  7921      1
1  7922      1
2  7923      1
3  7924      1
4  7925      1
