In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pickle

In [9]:
df = pd.read_csv("Hit_1.csv")
df.drop(columns = df.columns[0], inplace = True)

In [11]:
df.head()

Unnamed: 0,SongID,Song,Performer,Lyrics,Cleaned_Lyrics,Hit,Tokens,Flesch_Reading_Ease
0,'65 Love AffairPaul Davis,'65 Love Affair,Paul Davis,I was a car hop\rYou were into be-bop\rYou san...,car hop bebop sang wop diddy wop diddy wop doo...,0,"['car', 'hop', 'bebop', 'sang', 'wop', 'diddy'...",-50.33
1,'Til My Baby Comes HomeLuther Vandross,'Til My Baby Comes Home,Luther Vandross,Theres a whole lot of girls\r\nmessin around\r...,whole lot girls messin around trying get thing...,0,"['whole', 'lot', 'girls', 'messin', 'around', ...",-27.32
2,'Til Summer Comes AroundKeith Urban,'Til Summer Comes Around,Keith Urban,Another long summer's come and gone\r\nI don't...,another long summer 's come gone know always e...,0,"['another', 'long', 'summer', ""'s"", 'come', 'g...",-64.88
3,'Til You Do Me RightAfter 7,'Til You Do Me Right,After 7,I was in love with you\r\nAnd gave my heart to...,love gave heart best keep satisfied took love ...,0,"['love', 'gave', 'heart', 'best', 'keep', 'sat...",-3.3
4,'TilThe Angels,'Til,The Angels,"Due to copyright restrictions, we are not auth...",due copyright restrictions authorized display ...,0,"['due', 'copyright', 'restrictions', 'authoriz...",6.17


In [13]:
df = df[["Cleaned_Lyrics", "Hit"]]

In [15]:
df.head()

Unnamed: 0,Cleaned_Lyrics,Hit
0,car hop bebop sang wop diddy wop diddy wop doo...,0
1,whole lot girls messin around trying get thing...,0
2,another long summer 's come gone know always e...,0
3,love gave heart best keep satisfied took love ...,0
4,due copyright restrictions authorized display ...,0


In [22]:
df.dropna(inplace = True)

In [24]:
df.isna().sum()

Cleaned_Lyrics    0
Hit               0
dtype: int64

# TF-IDF Feature Extraction

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_features = tfidf_vectorizer.fit_transform(df["Cleaned_Lyrics"])

In [28]:
svd = TruncatedSVD(n_components=300, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_features)

In [30]:
combined_features = tfidf_reduced

In [32]:
y = df["Hit"]
X_train, X_test, y_train, y_test = train_test_split(
    combined_features, y, test_size=0.2, random_state=42, stratify=y
)

# Train a Random Forest Classifie

In [37]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2505
           1       0.00      0.00      0.00       253

    accuracy                           0.91      2758
   macro avg       0.45      0.50      0.48      2758
weighted avg       0.82      0.91      0.86      2758



In [41]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight="balanced"),
                           param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)


Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}


In [43]:
y_pred_tuned = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred_tuned))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2505
           1       0.00      0.00      0.00       253

    accuracy                           0.91      2758
   macro avg       0.45      0.50      0.48      2758
weighted avg       0.82      0.91      0.86      2758



# Reccurent Network

In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

2025-03-16 02:30:36.761968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [49]:
texts = df["Cleaned_Lyrics"].tolist()
labels = df["Hit"].tolist()

In [51]:
max_words = 10000  # vocabulary size
max_len = 300      # maximum sequence length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

In [53]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels)

In [57]:
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")
])



In [59]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [61]:
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, np.array(y_train), epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stop])

Epoch 1/10
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 254ms/step - accuracy: 0.8986 - loss: 0.3416 - val_accuracy: 0.9035 - val_loss: 0.3183
Epoch 2/10
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 292ms/step - accuracy: 0.9028 - loss: 0.3235 - val_accuracy: 0.9035 - val_loss: 0.3183
Epoch 3/10
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 262ms/step - accuracy: 0.9128 - loss: 0.3006 - val_accuracy: 0.9035 - val_loss: 0.3190
Epoch 4/10
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 283ms/step - accuracy: 0.9094 - loss: 0.3077 - val_accuracy: 0.9035 - val_loss: 0.3231
Epoch 5/10
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 305ms/step - accuracy: 0.9098 - loss: 0.3040 - val_accuracy: 0.9035 - val_loss: 0.3187


<keras.src.callbacks.history.History at 0x14f167500>

In [63]:
loss, accuracy = model.evaluate(X_test, np.array(y_test))
print(f"Test Accuracy (CNN): {accuracy:.4f}")

[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 73ms/step - accuracy: 0.9055 - loss: 0.3140
Test Accuracy (CNN): 0.9083


In [65]:
y_prob = model.predict(X_test)

# Convert probabilities to class labels (assuming binary classification with threshold 0.5)
y_pred = (y_prob > 0.5).astype(int)


print(classification_report(y_test, y_pred))

[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 74ms/step
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2505
           1       0.00      0.00      0.00       253

    accuracy                           0.91      2758
   macro avg       0.45      0.50      0.48      2758
weighted avg       0.82      0.91      0.86      2758



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
