<a href="https://colab.research.google.com/github/adityar2309/ADITYAR-MOVIE-SENTIMENT-SCIKITLEARN/blob/main/ADITYA_Movie_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries;
# each entry is downloaded and unpacked under KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the encoded URL carries X-Goog-Date=20240311T065453Z and
# X-Goog-Expires=259200, so this signed link has presumably expired — confirm
# and regenerate it before relying on this cell.
DATA_SOURCE_MAPPING = 'sentiment-prediction-on-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F53569%2F5834979%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240311%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240311T065453Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D77cefae049a874010b56496b986039e3b4b2ba6e03f2288ef20460ea4687eea998cf74b08cae0ef10550c7488272bd4a9d9e4149b84850654d48590fd5e08ceda5b89df2e767443089dfa5637814c7711057c06c265609ae8e678eea97f2005fe817866bcaf90304054c72c580c739de8475d01f85512e4e833567c3b3f5421305e74e45f8e92a845c0cb49f018d3e0d5d6078a121246d8dd2c665ea741885c5bcd74b12981fe1ea36d6a4a72193cdc316db245543e7663089eed4344c116d2f3b73754e733b55a50483848a3f7d19b62817a9074d999d3f48da26ceab5d58e29eb8f234704df6a9ce64e0c8daabd191b2e0082c8e3e81e96fe28898087a7416'

# Directories created by this cell: downloaded inputs and working files.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible source.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split on the first
    # colon only so the entry is always unpacked into exactly two parts.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing (None) or unusable; the
            # progress bar then stays empty instead of raising.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                # Progress bar is 50 characters wide; clamp so a short
                # Content-Length cannot produce a negative padding width.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import  StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
# Read the three competition CSV files: training reviews, test reviews, movie metadata.
train_data, test_data, movie_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv',
    )
)


In [None]:
# Keep only the first row seen for each movieid.
is_repeat_movie = movie_data.duplicated(subset=['movieid'], keep='first')
movie_data = movie_data[~is_repeat_movie]
# Left-join the movie metadata onto the train and test reviews by movieid.
train_merged = pd.merge(train_data, movie_data, on="movieid", how='left')
test_merged = pd.merge(test_data, movie_data, on="movieid", how='left')

In [None]:
# Summary statistics of the merged training frame.
print(train_merged.describe())

# Percentage of null values in each column. The mean of the null mask uses
# train_merged's own row count as the denominator, so the figure cannot drift
# if the merged frame ever differs in length from train_data.
print(train_merged.isnull().mean() * 100)

# Histogram of audience scores; missing scores are dropped explicitly so no
# NaN reaches the binning step.
fig, ax = plt.subplots()
ax.hist(train_merged['audienceScore'].dropna())
ax.set_xlabel('Audience Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Audience Scores')
plt.show()



<h3>Summary Statistics:</h3>
<p>
        The audienceScore column has a mean of about 65 and a standard deviation of about 19.9.
        The runtimeMinutes column has a mean of about 107 minutes and a standard deviation of about 22.1.
        audienceScore ranges from 0 to 100, with quartiles at 51, 68, and 82.
        runtimeMinutes ranges from 4 to 561 minutes.
    </p>
<h3>Null Values:</h3>
<p>
        The percentage of missing values in each column is shown. For example, reviewText is missing in about 3.96% of rows, while rating and ratingContents are each missing in about 39.15% of rows.
    </p>

In [None]:
# Target: the sentiment label of every training row.
y_train = train_merged['sentiment']

# Columns removed from both the training and the test frame.
shared_dropped_columns = [
    'movieid',
    'releaseDateTheaters',
    'releaseDateStreaming',
    'runtimeMinutes',
    'boxOffice',
    'soundType',
    'ratingContents',
]

# Each frame additionally drops its own reviewer-flag column; the training
# frame also drops the target. Columns such as title, genre, reviewerName,
# director, distributor, originalLanguage and rating are kept.
X_train = train_merged.drop(columns=shared_dropped_columns + ['isFrequentReviewer', 'sentiment'])
X_test = test_merged.drop(columns=shared_dropped_columns + ['isTopCritic'])



In [None]:
# Feature groups: one numeric column and the text-valued columns.
numeric_features = ['audienceScore']
categorical_features = ['reviewText', 'title', 'rating','genre', 'reviewerName', 'originalLanguage', 'distributor', 'director']

# Fill missing text values with each column's most frequent training value.
# SimpleImputer computes the statistic per column, so a single fit over all
# columns replaces the per-column loop and yields a 2-D result that is
# assigned back to the matching 2-D column selection.
imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_features] = imputer.fit_transform(X_train[categorical_features])
X_test[categorical_features] = imputer.transform(X_test[categorical_features])

# Numeric columns: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),('scaler', StandardScaler())])
# Text columns: TF-IDF vectorization (applied to one column at a time).
categorical_transformer = Pipeline(steps=[('tfidf',TfidfVectorizer())])

In [None]:
# Step name -> text column vectorized by that step (insertion order is the
# order of the resulting feature blocks).
text_steps = {
    'cat0': 'reviewText',
    'cat1': 'title',
    'cat2': 'genre',
    'cat3': 'reviewerName',
    'cat4': 'originalLanguage',
    'cat5': 'distributor',
    'cat6': 'director',
}

# Numeric block first, followed by one TF-IDF block per text column.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
    + [
        (step_name, categorical_transformer, column_name)
        for step_name, column_name in text_steps.items()
    ]
)


In [None]:
# Seed shared by the estimators below so repeated runs give the same models.
RANDOM_STATE = 42

# Candidate models and the hyperparameter values sampled by RandomizedSearchCV.

# Logistic regression; every candidate uses the 'saga' solver.
logreg_model = LogisticRegression(random_state=RANDOM_STATE)
logreg_param_distributions = {
    'C': [1.5, 2, 3],
    'solver': ['saga'],
    'max_iter': [1000]
}

# Linear support vector classifier.
svc_model = LinearSVC(max_iter=1000, random_state=RANDOM_STATE)
svc_param_distributions = {
    'C': [0.3, 0.4, 0.5, 1.2],
    'tol': [0.001, 0.0001],
    'max_iter': [1000]
}

# Logistic regression fitted by stochastic gradient descent.
sgd_logreg_model = SGDClassifier(loss='log_loss', random_state=RANDOM_STATE)
# Search space for sgd_logreg_model (the generic name is kept because later
# cells refer to it).
param_distributions = {
    'alpha': [1, 1.2],
    'max_iter': [1000]
}

In [None]:

# logreg_model = LogisticRegression()
# logreg_param_distributions = {
#     'C': [1.5,2,3],
#     'solver':['saga'],
#     'max_iter':[1000]
# }

# svc_model = LinearSVC(max_iter=1000)
# svc_param_distributions = {
#     'C': [0.3,0.4,0.5,1.2],
#     'tol':[0.001,0.0001],
#     'max_iter':[1000]
# }

# sgd_logreg_model = SGDClassifier(loss='log')
# param_distributions = {
#     'alpha': [1,1.2],
#     'max_iter': [1000]
# }

# logreg_random_search = RandomizedSearchCV(logreg_model, logreg_param_distributions, n_iter=3, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
# logreg_random_search.fit(X_train_tfidf, y_train)

# svc_random_search = RandomizedSearchCV(svc_model, svc_param_distributions, n_iter=8, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
# svc_random_search.fit(X_train_tfidf, y_train)

# sgd_random_search = RandomizedSearchCV(sgd_logreg_model, param_distributions, n_iter=3, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
# sgd_random_search.fit(X_train_tfidf, y_train)

# print(logreg_random_search.best_params_)
# print(svc_random_search.best_params_)
# print(sgd_random_search.best_params_)


In [None]:
def build_search_pipeline(model, param_distributions, n_iter, random_state=42):
    """Return a pipeline that preprocesses the data, then tunes `model`.

    Args:
        model: Unfitted scikit-learn classifier to tune.
        param_distributions: Mapping of parameter name to candidate values.
        n_iter: Number of parameter settings RandomizedSearchCV samples.
        random_state: Seed for the sampling of parameter settings, so the
            same candidates are drawn on every run.

    Returns:
        A Pipeline with a 'preprocessor' step followed by a 'classifier' step
        holding the RandomizedSearchCV (3-fold CV, scored by accuracy).
    """
    # NOTE(review): the same module-level `preprocessor` instance is passed to
    # every pipeline built here.
    search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=n_iter,
        cv=3,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
        random_state=random_state,
    )
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', search)
    ])


# One tuning pipeline per model; n_iter is the number of sampled settings.
logreg_pipeline = build_search_pipeline(logreg_model, logreg_param_distributions, n_iter=2)
svc_pipeline = build_search_pipeline(svc_model, svc_param_distributions, n_iter=4)
sgd_pipeline = build_search_pipeline(sgd_logreg_model, param_distributions, n_iter=2)

In [None]:
# Train the three tuning pipelines on the training data, one after another.
search_pipelines = (logreg_pipeline, svc_pipeline, sgd_pipeline)
for search_pipeline in search_pipelines:
    search_pipeline.fit(X_train, y_train)

# Report the best hyperparameters each search selected.
for search_pipeline in search_pipelines:
    print(search_pipeline.named_steps['classifier'].best_params_)


<h3>Insights from Model Pipelines and Hyperparameter Tuning
</h3>
Three pipelines were built, one per model: Logistic Regression (LogReg), a linear Support Vector Classifier (LinearSVC, referred to as SVC below), and a logistic-regression model trained with Stochastic Gradient Descent (SGD). Each pipeline consists of two stages:

<h3>Preprocessing:</h3>
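        A preprocessing step is applied to the data using the preprocessor component.
        The preprocessor mean-imputes and standardizes the numeric audienceScore column and converts each text column (reviewText, title, genre, reviewerName, originalLanguage, distributor, director) into TF-IDF features. Missing text values are filled with the most frequent value before the pipeline runs.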

<h3>Model Training with Hyperparameter Tuning:</h3>
        Each pipeline wraps its classifier in RandomizedSearchCV for hyperparameter tuning.
        RandomizedSearchCV randomly samples a specified number of hyperparameter combinations from the given parameter distributions and uses cross-validation to select the combination with the highest accuracy.
        For the LogReg model, 2 combinations were sampled (n_iter=2) and evaluated with 3-fold cross-validation.
        For the SVC model, 4 combinations were sampled (n_iter=4) and evaluated with 3-fold cross-validation.
        For the SGD model, 2 combinations were sampled (n_iter=2) and evaluated with 3-fold cross-validation.

After fitting the pipelines to the training data, the best hyperparameters were extracted from each model's classifier component:

<h3>Best hyperparameters for LogReg:</h3>
        'solver': 'saga'
        'max_iter': 1000
        'C': 2

   <h3>Best hyperparameters for SVC:</h3>
        'tol': 0.001
        'max_iter': 1000
        'C': 0.3

   <h3>Best hyperparameters for SGD:</h3>
        'max_iter': 1000
        'alpha': 1

These insights provide valuable information about the selected hyperparameters for each model, allowing for better understanding and reproducibility of the model training process.

In [None]:
# Sentiment predictions of each fitted pipeline on the training rows.
y_pred_logreg, y_pred_svc, y_pred_sgd = (
    fitted_pipeline.predict(X_train)
    for fitted_pipeline in (logreg_pipeline, svc_pipeline, sgd_pipeline)
)

In [None]:
# Accuracy of each model's predictions against the training labels.
accuracy_logreg, accuracy_svc, accuracy_sgd = (
    accuracy_score(y_train, train_predictions)
    for train_predictions in (y_pred_logreg, y_pred_svc, y_pred_sgd)
)

In [None]:
# Training accuracies, one per line: LogReg, SVC, SGD.
print(accuracy_logreg, accuracy_svc, accuracy_sgd, sep='\n')

In [None]:

# y_pred_logreg = logreg_random_search.predict(X_train_tfidf)
# y_pred_svc = svc_random_search.predict(X_train_tfidf)
# y_pred_sgd = sgd_random_search.predict(X_train_tfidf)


# # Calculate accuracies
# accuracy_logreg = accuracy_score(y_train, y_pred_logreg)
# accuracy_svc = accuracy_score(y_train, y_pred_svc)
# accuracy_sgd = accuracy_score(y_train, y_pred_sgd)


# # Print accuracies
# print(accuracy_logreg)
# print(accuracy_svc)
# print(accuracy_sgd)


In [None]:
# Sentiment predictions of each fitted pipeline on the test rows.
y_pred_logreg_test, y_pred_svc_test, y_pred_sgd_test = (
    fitted_pipeline.predict(X_test)
    for fitted_pipeline in (logreg_pipeline, svc_pipeline, sgd_pipeline)
)

In [None]:
# Write the SVC test predictions to submission.csv; the row index is written
# as the 'id' column.
submission = pd.DataFrame(y_pred_svc_test, columns=['sentiment'])
submission.to_csv('submission.csv', index_label='id')