<a href="https://colab.research.google.com/github/abir16-tech/BRAINWAVE_MATRIX_INTERN-/blob/main/BRAINWAVE_MATRIX_SOLUTION_TASK_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import requests
import os
import re
import zipfile

# Download the Sentiment140 dataset archive from Kaggle with curl.
# Note: Kaggle datasets often require authentication, so this direct URL may be
# rejected without it. The Kaggle API is the more robust way to fetch Kaggle
# data in Colab.
download_url = 'https://www.kaggle.com/api/v1/datasets/download/kazanova/sentiment140'
zip_file_path = '/content/sentiment140.zip'
# Name of the CSV expected inside the archive (referenced in an error message further down).
extracted_csv_path = '/content/training.1600000.processed.noemoticon.csv'


# Download through the notebook's shell.
# --fail makes curl exit with an error on an HTTP 4xx/5xx response instead of
# saving the server's error page under the .zip name, where it would later be
# mistaken for the dataset. The arguments are quoted so the shell passes them
# to curl verbatim.
get_ipython().system(f'curl -L --fail -o "{zip_file_path}" "{download_url}"')


# The archive is expected to contain 'training.1600000.processed.noemoticon.csv';
# it is read straight from the zip rather than being extracted first.
file_path = zip_file_path  # Pointing to the zip file

# Regexes used by clean_text, compiled once because they run on every tweet.
# The former third URL alternative (`https\S+`) was dead code: `http\S+`
# already matches every https URL, so dropping it changes no result.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#\w+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')


def clean_text(text):
    """Normalize one tweet for TF-IDF vectorization.

    Lowercases the text, then strips URLs, @mentions, #hashtags and finally
    every character that is neither a word character nor whitespace
    (underscores count as word characters and are kept).

    Args:
        text: Raw tweet. ``None`` and NaN (what pandas produces for an empty
            CSV field) are treated as an empty tweet; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        may leave leading, trailing or doubled spaces behind.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ""
        text = str(text)
    text = text.lower()
    text = _URL_PATTERN.sub('', text)       # Remove URLs
    text = _MENTION_PATTERN.sub('', text)   # Remove mentions
    text = _HASHTAG_PATTERN.sub('', text)   # Remove hashtags
    return _PUNCTUATION_PATTERN.sub('', text)  # Remove punctuation


# In a notebook cell or a script run `__name__` is "__main__", so the pipeline
# below executes exactly as before and its variables remain globals. The guard
# only keeps an import of this file from loading data and training models as a
# side effect.
if __name__ == "__main__":
    # Check that the download produced a usable archive before parsing it.
    if not os.path.exists(file_path):
        print(f"Error: Downloaded zip file not found at {file_path}. Please ensure the download was successful (Kaggle authentication might be required).")
    elif not zipfile.is_zipfile(file_path):
        # A rejected request can leave an error page saved under the .zip name.
        print(f"Error: {file_path} is not a valid zip archive. The download probably failed (Kaggle authentication might be required).")
    else:
        # Read directly from the zip file.
        # engine='python' and on_bad_lines='skip' tolerate malformed rows.
        try:
            df = pd.read_csv(file_path, encoding='latin-1', header=None, compression='zip', on_bad_lines='skip', engine='python')
            # Column 0 holds the polarity label and column 5 the tweet text.
            df = df[[0, 5]]
            df.columns = ['polarity', 'text']
            print(df.head())

            # Keep only negative (0) and positive (4) rows. This drops the
            # neutral class (2) and any unexpected label that would otherwise
            # map to NaN and break the stratified split. .copy() makes the
            # column assignments below act on an independent frame instead of
            # a slice of the original (avoids SettingWithCopyWarning).
            df = df[df['polarity'].isin([0, 4])].copy()

            # Map polarity values 0 to 0 and 4 to 1
            df['polarity'] = df['polarity'].map({0: 0, 4: 1})

            # Shuffle the DataFrame
            df = df.sample(frac=1, random_state=42)

            print(df['polarity'].value_counts())

            df['clean_text'] = df['text'].apply(clean_text)

            print(df[['text', 'clean_text']].head())

            # Stratify so both classes keep the same ratio in train and test.
            X_train, X_test, y_train, y_test = train_test_split(
                df['clean_text'],
                df['polarity'],
                test_size=0.2,
                random_state=42,
                stratify=df['polarity'],
            )

            print("Train size:", len(X_train))
            print("Test size:", len(X_test))

            # The classifiers below need both classes in the training data.
            if y_train.nunique() < 2:
                print("Error: Training data does not contain both classes after split. Check data loading and filtering.")
            else:
                # Unigrams and bigrams, capped at the 5000 most frequent terms.
                vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

                # Fit the vocabulary on the training split only, then reuse it
                # for the test split so no test information leaks into it.
                X_train_tfidf = vectorizer.fit_transform(X_train)
                X_test_tfidf = vectorizer.transform(X_test)

                print("TF-IDF shape (train):", X_train_tfidf.shape)
                print("TF-IDF shape (test):", X_test_tfidf.shape)

                # --- Model Training and Evaluation ---

                # Bernoulli Naive Bayes
                bnb = BernoulliNB()
                bnb.fit(X_train_tfidf, y_train)
                bnb_pred = bnb.predict(X_test_tfidf)
                print("\nBernoulli Naive Bayes Accuracy:", accuracy_score(y_test, bnb_pred))
                print("\nBernoulliNB Classification Report:\n", classification_report(y_test, bnb_pred))

                # Linear SVM
                svm = LinearSVC(max_iter=1000)
                svm.fit(X_train_tfidf, y_train)
                svm_pred = svm.predict(X_test_tfidf)
                print("\nSVM Accuracy:", accuracy_score(y_test, svm_pred))
                print("\nSVM Classification Report:\n", classification_report(y_test, svm_pred))

                # Logistic Regression
                logreg = LogisticRegression(max_iter=1000)  # Increased max_iter
                logreg.fit(X_train_tfidf, y_train)
                logreg_pred = logreg.predict(X_test_tfidf)
                print("\nLogistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
                print("\nLogistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))

                # --- Sample Predictions ---
                sample_tweets = ["I love this!", "I hate that!", "It was okay, not great."]
                # Samples go through the same cleaning as the training data.
                sample_vec = vectorizer.transform([clean_text(tweet) for tweet in sample_tweets])

                print("\nSample Predictions:")
                print("BernoulliNB:", bnb.predict(sample_vec))
                print("SVM:", svm.predict(sample_vec))
                print("Logistic Regression:", logreg.predict(sample_vec))

        except FileNotFoundError:
            print(f"Error: Expected CSV file not found within the zip at {extracted_csv_path}.")
        except Exception as e:
            # Top-level boundary of the cell: report the failure instead of
            # raising, matching the best-effort behavior of the other branches.
            print(f"An error occurred during file processing or model training: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 80.9M  100 80.9M    0     0  52.6M      0  0:00:01  0:00:01 --:--:-- 67.2M
   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....
polarity
0    800000
1    800000
Name: count, dtype: int64
                                                     text  \
541200             @chrishasboobs AHHH I HOPE YOUR OK!!!    
750     @misstoriblack cool , i have no tweet apps  fo...   
766711  @TiannaChaos i know  just family drama. its la...   
285

# New section