<a href="https://colab.research.google.com/github/abby306/Codsoft_ML/blob/main/Codsoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Movie Genre Classifer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Read and preprocess data
def read_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(" ::: ")
            if len(parts) == 4:  # Training data format
                data.append((parts[3], parts[2]))
    return data

train_data = read_data('movie_train.txt')
test_data = read_data('movie_test.txt')

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

# Step 2: Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

# Step 3: Choose a classifier
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=100),
    'SVM': SVC()
}

# Step 4 and 5: Train and evaluate
for name, classifier in classifiers.items():
    print(f"Training and evaluating {name} classifier...")
    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('clf', classifier)
    ])

    # Splitting the training data for evaluation
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    pipeline.fit(X_train_split, y_train_split)
    y_pred = pipeline.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    print(f"Validation Accuracy for {name}: {accuracy:.2f}")

    # Retraining on the full training set
    pipeline.fit(X_train, y_train)

    # Predicting on the test data
    y_test_pred = pipeline.predict(X_test)

    # Evaluating on test data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test Accuracy for {name}: {test_accuracy:.2f}")

    # Classification report
    report = classification_report(y_test, y_test_pred)
    print(f"Classification Report for {name}:\n{report}\n")

    # Saving predictions to a file
    with open(f'{name}_predictions.txt', 'w', encoding='utf-8') as file:
        for prediction in y_test_pred:
            file.write(f"{prediction}\n")


Training and evaluating Naive Bayes classifier...
Validation Accuracy for Naive Bayes: 0.46
Test Accuracy for Naive Bayes: 0.46
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       157
       adult       0.00      0.00      0.00        73
   adventure       0.00      0.00      0.00        81
   animation       0.00      0.00      0.00        82
   biography       0.00      0.00      0.00        34
      comedy       0.65      0.14      0.23       855
       crime       0.00      0.00      0.00        44
 documentary       0.53      0.88      0.66      1568
       drama       0.40      0.87      0.54      1650
      family       0.00      0.00      0.00        91
     fantasy       0.00      0.00      0.00        28
   game-show       0.00      0.00      0.00        20
     history       0.00      0.00      0.00        32
      horror       0.00      0.00      0.00       253
       music       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy for Logistic Regression: 0.50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy for Logistic Regression: 0.52
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

      action       1.00      0.03      0.05       157
       adult       0.00      0.00      0.00        73
   adventure       0.00      0.00      0.00        81
   animation       0.00      0.00      0.00        82
   biography       0.00      0.00      0.00        34
      comedy       0.47      0.45      0.46       855
       crime       0.00      0.00      0.00        44
 documentary       0.57      0.87      0.69      1568
       drama       0.47      0.81      0.60      1650
      family       0.00      0.00      0.00        91
     fantasy       0.00      0.00      0.00        28
   game-show       1.00      0.05      0.10        20
     history       0.00      0.00      0.00        32
      horror       0.73      0.26      0.38       253
       music       1.00      0.07      0.13        88
     musical       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy for SVM: 0.49
Test Accuracy for SVM: 0.50
Classification Report for SVM:
              precision    recall  f1-score   support

      action       1.00      0.01      0.01       157
       adult       0.00      0.00      0.00        73
   adventure       0.00      0.00      0.00        81
   animation       0.00      0.00      0.00        82
   biography       0.00      0.00      0.00        34
      comedy       0.55      0.34      0.42       855
       crime       0.00      0.00      0.00        44
 documentary       0.56      0.87      0.68      1568
       drama       0.43      0.86      0.57      1650
      family       0.00      0.00      0.00        91
     fantasy       0.00      0.00      0.00        28
   game-show       1.00      0.20      0.33        20
     history       0.00      0.00      0.00        32
      horror       0.76      0.13      0.22       253
       music       0.00      0.00      0.00        88
     musical       0.00      0.00      0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Spam Detection

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

data = pd.read_csv('spam.csv',encoding='latin-1')
data['v2'] = data['v2'].str.lower()
data['v2'] = data['v2'].str.replace('[^\w\s]', '')

X = data['v2']
y = data['v1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
nb_pred = nb_classifier.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_tfidf, y_train)
lr_pred = lr_classifier.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)
svm_pred = svm_classifier.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data['v2'] = data['v2'].str.replace('[^\w\s]', '')


Naive Bayes Accuracy: 0.967713004484305
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Logistic Regression Accuracy: 0.9515695067264573
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

SVM Accuracy: 0.9757847533632287
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.98      0.83      0.90       150

    accuracy                           0.98      1115
   mac

Customer Churn

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv('Churn_Modelling.csv')
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])

X = data.drop(columns=['Exited'])
y = data['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_scaled, y_train)
lr_pred = lr_classifier.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train)
rf_pred = rf_classifier.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train_scaled, y_train)
gb_pred = gb_classifier.predict(X_test_scaled)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_pred))
print(classification_report(y_test, gb_pred))




Logistic Regression Accuracy: 0.815
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

Random Forest Accuracy: 0.8645
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.86      0.86      0.85      2000

Gradient Boosting Accuracy: 0.865
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.58       393

    accuracy                           0.86      2000
   macro avg       0.82    