In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AKS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AKS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\AKS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
common_words = ['music', 'song', 'night', 'time', 'feel', 'world', 'life', 'sound', 
                'heart', 'love', 'soul', 'rhythm', 'energy', 'feel', 'move']

In [6]:
genre_keywords = {
    'Rock': [
        ['guitar', 'solo', 'electric', 'riff', 'power', 'loud', 'drums', 'amplifier', 'distortion', 'music'],
        ['rebel', 'freedom', 'highway', 'motorcycle', 'wild', 'stage', 'concert', 'band', 'night'],
        ['metal', 'thunder', 'lightning', 'headbang', 'mosh', 'energy', 'raw', 'intense', 'sound'],
        ['punk', 'attitude', 'rebellion', 'angry', 'youth', 'street', 'alternative', 'indie', 'life'],
        ['melody', 'atmospheric', 'emotional', 'grunge', 'anthem', 'chorus', 'feel', 'soul'],
        ['classic', 'legendary', 'iconic', 'arena', 'stadium', 'epic', 'powerful', 'driving', 'world'],
        ['bass', 'feedback', 'overdrive', 'jam', 'live', 'performance', 'crowd', 'energy']
    ],
    'Pop': [
        ['love', 'heart', 'dancing', 'party', 'celebration', 'fun', 'happiness', 'smile', 'joy', 'music'],
        ['baby', 'tonight', 'dance', 'floor', 'club', 'lights', 'romance', 'forever', 'feel'],
        ['summer', 'sunshine', 'beautiful', 'feeling', 'amazing', 'wonderful', 'dream', 'perfect', 'world'],
        ['sweet', 'melody', 'catchy', 'chorus', 'radio', 'friendly', 'star', 'glamour', 'song'],
        ['upbeat', 'positive', 'energy', 'happy', 'vibes', 'move', 'groove', 'shine', 'rhythm'],
        ['young', 'free', 'alive', 'moment', 'tonight', 'forever', 'together', 'feel', 'time'],
        ['bright', 'colorful', 'sparkle', 'magic', 'special', 'touch', 'kiss', 'dance', 'night']
    ],
    'Hip-Hop': [
        ['rap', 'flow', 'lyrical', 'rhyme', 'beat', 'street', 'hustle', 'grind', 'money', 'life'],
        ['culture', 'urban', 'city', 'lifestyle', 'swagger', 'confident', 'boss', 'world'],
        ['microphone', 'stage', 'performance', 'crowd', 'energy', 'hype', 'party', 'turn', 'night'],
        ['bars', 'verses', 'punch', 'lines', 'wordplay', 'metaphor', 'rhythm', 'poetry', 'soul'],
        ['trap', 'bass', 'heavy', 'bounce', 'club', 'banger', 'lit', 'fire', 'music'],
        ['freestyle', 'cipher', 'underground', 'authentic', 'real', 'truth', 'story', 'life', 'feel'],
        ['boom', 'sample', 'scratch', 'turntable', 'vinyl', 'classic', 'sound']
    ],
    'Country': [
        ['road', 'truck', 'farm', 'hometown', 'small', 'town', 'rural', 'nostalgia', 'life'],
        ['cowboy', 'boots', 'hat', 'ranch', 'horse', 'sunset', 'prairie', 'dust', 'trail', 'night'],
        ['whiskey', 'bar', 'heartbreak', 'pickup', 'dirt', 'memories', 'simple', 'honest', 'love'],
        ['southern', 'comfort', 'guitar', 'storytelling', 'values', 'home', 'heart', 'soul'],
        ['fiddle', 'banjo', 'bluegrass', 'mountain', 'folk', 'tradition', 'heritage', 'roots', 'music'],
        ['field', 'cornfield', 'barn', 'tractor', 'sunrise', 'porch', 'front', 'swing', 'time'],
        ['beer', 'cold', 'tailgate', 'bonfire', 'stars', 'night', 'sky', 'feel', 'world']
    ],
    'Electronic': [
        ['synthesizer', 'techno', 'beat', 'drop', 'bass', 'edm', 'festival', 'rave', 'music'],
        ['house', 'dance', 'club', 'dj', 'turntable', 'mixing', 'vinyl', 'production', 'night'],
        ['ambient', 'soundscape', 'atmospheric', 'chill', 'relax', 'meditation', 'peaceful', 'calm', 'sound'],
        ['dubstep', 'wobble', 'bass', 'heavy', 'drop', 'glitch', 'digital', 'synthetic', 'energy'],
        ['trance', 'euphoric', 'uplifting', 'progressive', 'journey', 'melodic', 'energy', 'feel'],
        ['drum', 'machine', 'sequencer', 'loop', 'sample', 'midi', 'digital', 'virtual', 'rhythm'],
        ['laser', 'lights', 'neon', 'electric', 'pulse', 'wave', 'frequency', 'vibration', 'sound', 'world']
    ]
}

In [7]:
def generate_lyrics(genre, num_samples=100):
    lyrics_list = []
    keywords_pool = genre_keywords[genre]
    
    for _ in range(num_samples):
        num_sets = random.randint(3, 5)
        selected_sets = random.sample(keywords_pool, min(num_sets, len(keywords_pool)))
        
        words = []
        for keyword_set in selected_sets:
            num_words = random.randint(3, 6)
            words.extend(random.sample(keyword_set, min(num_words, len(keyword_set))))
        
        
        if random.random() < 0.4:
            num_common = random.randint(3, 6) 
            words.extend(random.sample(common_words, num_common))
        
       
        if random.random() < 0.3:
            other_genres = [g for g in genre_keywords.keys() if g != genre]
           
            num_genres_to_mix = 1 if random.random() < 0.7 else 2
            for _ in range(num_genres_to_mix):
                random_genre = random.choice(other_genres)
                random_set = random.choice(genre_keywords[random_genre])
                num_words = random.randint(2, 4)  
                words.extend(random.sample(random_set, min(num_words, len(random_set))))
        
       
        length_variation = random.random()
        if length_variation < 0.25:  
            words = words[:random.randint(4, 8)]
        elif length_variation > 0.85:  # 
          
            extra_set = random.choice(keywords_pool)
            words.extend(random.sample(extra_set, random.randint(3, 5)))
        
        
        if random.random() < 0.2 and len(words) > 5:
            word_to_duplicate = random.choice(words)
            words.append(word_to_duplicate)
        
       
        random.shuffle(words)
        lyrics = ' '.join(words)
        lyrics_list.append(lyrics)
    
    return lyrics_list

In [8]:
all_lyrics = []
all_genres = []

for genre in genre_keywords.keys():
    lyrics = generate_lyrics(genre, num_samples=160)  
    all_lyrics.extend(lyrics)
    all_genres.extend([genre] * 160)


print("\nAdding hybrid genre samples to increase difficulty...")
num_hybrids = int(len(all_lyrics) * 0.1)

for _ in range(num_hybrids):
   
    genre1, genre2 = random.sample(list(genre_keywords.keys()), 2)
    
    words = []
    
    for genre in [genre1, genre2]:
        keyword_set = random.choice(genre_keywords[genre])
        words.extend(random.sample(keyword_set, random.randint(4, 7)))
    
  
    words.extend(random.sample(common_words, random.randint(2, 4)))
    
    random.shuffle(words)
    hybrid_lyrics = ' '.join(words)
    
   
    hybrid_genre = random.choice([genre1, genre2])
    
    all_lyrics.append(hybrid_lyrics)
    all_genres.append(hybrid_genre)

print(f"   Added {num_hybrids} hybrid samples")


Adding hybrid genre samples to increase difficulty...
   Added 80 hybrid samples


In [9]:
df = pd.DataFrame({
    'lyrics': all_lyrics,
    'genre': all_genres
})

In [10]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
print(f"\nGenre distribution:\n{df['genre'].value_counts().sort_index()}")
print(f"\nSample lyrics:\n{df.head(3)}")

Dataset shape: (880, 2)

Genre distribution:
genre
Country       174
Electronic    185
Hip-Hop       174
Pop           175
Rock          172
Name: count, dtype: int64

Sample lyrics:
                                              lyrics       genre
0  city freestyle truth feel turn underground sta...     Hip-Hop
1  summer melody upbeat dance touch bright night ...         Pop
2  chill machine rhythm loop meditation calm peac...  Electronic


In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join(text.split())
    
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [w for w in words if w not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    
    return ' '.join(words)

In [12]:
df['processed_lyrics'] = df['lyrics'].apply(preprocess_text)
print("PREPROCESSING COMPLETE")
print(f"\nSample processed lyrics:")
print(df[['lyrics', 'processed_lyrics']].head(2))

PREPROCESSING COMPLETE

Sample processed lyrics:
                                              lyrics  \
0  city freestyle truth feel turn underground sta...   
1  summer melody upbeat dance touch bright night ...   

                                    processed_lyrics  
0  city freestyle truth feel turn underground sta...  
1  summer melody upbeat dance touch bright night ...  


In [13]:

X_train, X_test, y_train, y_test = train_test_split(
    df['processed_lyrics'], 
    df['genre'], 
    test_size=0.2, 
    random_state=42,
    stratify=df['genre']
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 704
Test set size: 176


In [14]:
tfidf = TfidfVectorizer(
    max_features=300, 
    ngram_range=(1, 2),  
    min_df=3,  
    max_df=0.7,  
    sublinear_tf=True  
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"\nTF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")


TF-IDF matrix shape: (704, 300)
Number of features: 300


In [15]:

print("TRAINING MODEL...")
rf_model = RandomForestClassifier(
    n_estimators=50,       
    max_depth=8,          
    min_samples_split=15,  
    min_samples_leaf=5,   
    max_features='sqrt',   
    max_samples=0.8,      
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_tfidf, y_train)
print("\n Model training completed!")

TRAINING MODEL...

 Model training completed!


In [16]:
print("\nRunning 5-Fold Cross-Validation...")
cv_scores = cross_val_score(rf_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"   CV Accuracy: {cv_scores.mean():.2%} (+/- {cv_scores.std() * 2:.2%})")
print(f"   Individual folds: {[f'{score:.2%}' for score in cv_scores]}")

if cv_scores.std() > 0.05:
    print("High variance detected - model may be overfitting!")
else:
    print("Low variance - model is generalizing well!")



Running 5-Fold Cross-Validation...
   CV Accuracy: 94.03% (+/- 3.69%)
   Individual folds: ['95.74%', '95.74%', '92.20%', '95.04%', '91.43%']
Low variance - model is generalizing well!


In [17]:
y_pred = rf_model.predict(X_test_tfidf)
y_pred_proba = rf_model.predict_proba(X_test_tfidf)

print("MODEL PERFORMANCE")
train_accuracy = rf_model.score(X_train_tfidf, y_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"\nTraining Accuracy: {train_accuracy:.2%}")
print(f" Test Accuracy:     {test_accuracy:.2%}")

# Check for overfitting
accuracy_gap = train_accuracy - test_accuracy
if accuracy_gap > 0.10:
    print(f"  OVERFITTING DETECTED! Gap: {accuracy_gap:.2%}")
    print("   Model performs much better on training data than test data.")
elif accuracy_gap > 0.05:
    print(f" Slight overfitting. Gap: {accuracy_gap:.2%}")
else:
    print(f"Good generalization Gap: {accuracy_gap:.2%}")

print(f"\n Classification Report:")
print(classification_report(y_test, y_pred))

MODEL PERFORMANCE

Training Accuracy: 95.74%
 Test Accuracy:     93.18%
Good generalization Gap: 2.56%

 Classification Report:
              precision    recall  f1-score   support

     Country       0.89      0.94      0.92        35
  Electronic       0.95      0.95      0.95        37
     Hip-Hop       0.89      0.89      0.89        35
         Pop       0.97      0.94      0.96        35
        Rock       0.97      0.94      0.96        34

    accuracy                           0.93       176
   macro avg       0.93      0.93      0.93       176
weighted avg       0.93      0.93      0.93       176



In [18]:
print("\nðŸ“ˆ Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=rf_model.classes_)
cm_df = pd.DataFrame(cm, index=rf_model.classes_, columns=rf_model.classes_)
print(cm_df)

feature_names = tfidf.get_feature_names_out()
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1][:15]

print(f"\nTop 15 Most Important Features:")
for i, idx in enumerate(indices, 1):
    print(f"{i:2d}. {feature_names[idx]:15s} â†’ {importances[idx]:.4f}")

print(f"\nPer-Genre Accuracy:")
for genre in rf_model.classes_:
    genre_mask = y_test == genre
    genre_accuracy = accuracy_score(y_test[genre_mask], y_pred[genre_mask])
    print(f"  {genre:15s}: {genre_accuracy:.2%}")


ðŸ“ˆ Confusion Matrix:
            Country  Electronic  Hip-Hop  Pop  Rock
Country          33           0        2    0     0
Electronic        1          35        1    0     0
Hip-Hop           2           0       31    1     1
Pop               0           2        0   33     0
Rock              1           0        1    0    32

Top 15 Most Important Features:
 1. drop            â†’ 0.0514
 2. tonight         â†’ 0.0382
 3. amazing         â†’ 0.0316
 4. overdrive       â†’ 0.0268
 5. dj              â†’ 0.0253
 6. grunge          â†’ 0.0230
 7. progressive     â†’ 0.0192
 8. beautiful       â†’ 0.0189
 9. loop            â†’ 0.0169
10. jam             â†’ 0.0160
11. melodic         â†’ 0.0157
12. confident       â†’ 0.0155
13. scratch         â†’ 0.0146
14. field           â†’ 0.0142
15. midi            â†’ 0.0139

Per-Genre Accuracy:
  Country        : 94.29%
  Electronic     : 94.59%
  Hip-Hop        : 88.57%
  Pop            : 94.29%
  Rock           : 94.12%


In [20]:
joblib.dump(rf_model, 'classifier_model.pkl')
joblib.dump(tfidf, 'vectorizer.pkl')
print("\nModel and vectorizer saved successfully!")
print("classifier_model.pkl")
print("vectorizer.pkl")


Model and vectorizer saved successfully!
classifier_model.pkl
vectorizer.pkl
