In [1]:
import pandas as pd
import os
import numpy as np



In [2]:
data_dir = '/Users/anjanapro/Desktop/language_classification/data/indian_songs_dataset'

In [3]:
# initialize an empty list to store each language dataframe
dfs = []

# loop through each file in the directory and read the file into a dataframe
for file in os.listdir(data_dir):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(data_dir, file))
        dfs.append(df)

# concatenate all the dataframes into a single dataframe
indian_songs_df = pd.concat(dfs, ignore_index=True)



In [4]:
# get an understanding of what attributes are present in the dataframe
indian_songs_df.head()

Unnamed: 0,song_name,singer,singer_id,duration,language,released_date,danceability,acousticness,energy,liveness,loudness,speechiness,tempo,mode,key,Valence,time_signature,popularity,Stream
0,Naguva Nayana,S. Janaki|S. P. Balasubrahmanyam,/artist/s-janaki|/artist/s-p-balasubrahamanyam,04:12,Kannada,09-10-2020,0.53,0.11,0.444,0.8,-7.552,0.093,133.899,0.0,8.0,0.04,4.0,74.0,5316814.0
1,Ellelli Nodali,S. Janaki|Dr. Rajkumar,/artist/s-janaki|/artist/dr-rajkumar-1,04:31,Kannada,07-09-2018,0.66,0.361,0.695,0.511,-13.317,0.0615,130.0,0.0,0.0,0.03,4.0,62.0,4573499.0
2,Chinnada Mallige Hoove,S. Janaki|Dr. Rajkumar,/artist/s-janaki|/artist/dr-rajkumar-1,04:35,Kannada,09-08-2014,0.58,0.42,0.472,0.6,-6.833,0.06,168.01,0.0,4.0,0.0985,4.0,25.0,4709951.0
3,Premalokadinda,K J Yesudas|S. Janaki,/artist/k-j-yesudas|/artist/s-janaki,04:35,Kannada,04-11-2018,0.14,0.037,0.258,0.237,-6.416,0.0595,123.004,0.0,1.0,0.021,4.0,58.0,1388726.0
4,Preethiyalli Iro Sukha,S. P. Balasubrahmanyam|Manjula Gururaj,/artist/s-p-balasubrahamanyam|/artist/manjula-...,04:41,Kannada,03-05-2012,0.8,0.56,0.584,0.8,-3.596,0.001,113.973,1.0,1.0,0.0766,4.0,68.0,1655326.0


In [5]:
# get an understanding of how many null values are present in the dataframe
indian_songs_df.isnull().sum()

song_name         0
singer            0
singer_id         0
duration          0
language          0
released_date     0
danceability      1
acousticness      1
energy            1
liveness          1
loudness          1
speechiness       1
tempo             2
mode              2
key               2
Valence           2
time_signature    2
popularity        2
Stream            2
dtype: int64

In [6]:
# get an understanding of how many different languages are present in the dataframe
indian_songs_df['language'].value_counts()

language
Telugu        4996
Marathi       4699
Tamil         4677
Punjabi       3818
Kannada       3559
Urdu          3116
Old           2448
Gujarati      2115
Hindi         1184
Bengali        958
Odia           940
Assamese       724
Rajasthani     541
Bhojpuri       519
Malayalam      479
Haryanvi       228
Name: count, dtype: int64

In [8]:
# the logic for mapping the full language column to the ISO 639-1 code if available, otherwise keep the full language name
language_code_map = {
    'Assamese': 'as',
    'Bengali': 'bn',
    'Bhojpuri': 'bho',
    'Gujarati': 'gu',
    'Haryanvi': 'haryanvi',
    'Hindi': 'hi',
    'Old': 'hi',
    'Kannada': 'kn',
    'Malayalam': 'ml',
    'Marathi': 'mr',
    'Odia': 'or',
    'Punjabi': 'pa',
    'Rajasthani': 'rajasthani',
    'Tamil': 'ta',
    'Telugu': 'te',
    'Urdu': 'ur'
}

def map_language_to_code(language):
    return language_code_map.get(language, language)

indian_songs_df['language_code'] = indian_songs_df['language'].apply(map_language_to_code)


In [9]:
# drop the columns that are not needed in the final dataframe
columns_to_keep = [
    'song_name', 'singer', 'popularity', 'danceability', 'energy', 'key', 
    'loudness', 'mode', 'speechiness', 'acousticness', 'liveness', 
    'Valence', 'tempo', 'duration', 'language_code'
]

indian_songs_df = indian_songs_df[columns_to_keep]


In [10]:
# since there were so few null values, drop those entire rows
indian_songs_df.dropna(inplace=True)

indian_songs_df.isnull().sum()

song_name        0
singer           0
popularity       0
danceability     0
energy           0
key              0
loudness         0
mode             0
speechiness      0
acousticness     0
liveness         0
Valence          0
tempo            0
duration         0
language_code    0
dtype: int64

In [11]:
global_songs_file = '/Users/anjanapro/Desktop/language_classification/data/spotify_songs.csv'

global_songs_df = pd.read_csv(global_songs_file)



In [12]:
# rename the global dataset columns to match the indian dataset columns
global_songs_df.rename(columns={
    'track_name': 'song_name',
    'track_artist': 'singer',
    'track_popularity': 'popularity',
    'duration_ms': 'duration',
    'valence': 'Valence',
    'language': 'language_code'
}, inplace=True)

In [13]:
global_songs_df = global_songs_df[columns_to_keep]


In [14]:
# combine the indian and global songs datasets into one df
combined_songs_df = pd.concat([indian_songs_df, global_songs_df], ignore_index=True)


In [15]:
combined_songs_df.drop(columns=['duration'], inplace=True)

In [16]:
# standardizing the numerical columns in case a model requires it
from sklearn.preprocessing import StandardScaler

numerical_columns = [
    'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 
    'speechiness', 'acousticness', 'liveness', 'Valence', 'tempo'
]

scaler = StandardScaler()

combined_songs_df[numerical_columns] = scaler.fit_transform(combined_songs_df[numerical_columns])

combined_songs_df[numerical_columns].head()



Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,liveness,Valence,tempo
0,0.736514,-0.083493,-0.450116,0.734904,0.009109,-1.137378,0.337627,-0.905578,1.404685,-0.658868,-0.177735
1,0.253803,0.420039,0.471892,-1.457722,-1.712827,-1.137378,-0.113233,-0.092093,0.410475,-0.696996,-0.314435
2,-1.234557,0.110173,-0.347262,-0.361409,0.223865,-1.137378,-0.134702,0.099125,0.71665,-0.435816,1.018208
3,0.092899,-1.594089,-1.133356,-1.183644,0.348418,-1.137378,-0.141859,-1.142169,-0.532133,-0.731312,-0.559717
4,0.495158,0.962304,0.064152,-1.183644,1.190718,0.879215,-0.979168,0.552862,1.404685,-0.519318,-0.876347


In [17]:
# sentence embedding the song and singer columns
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

combined_songs_df['song_embedding'] = combined_songs_df['song_name'].apply(lambda x: model.encode(x))

combined_songs_df['singer_embedding'] = combined_songs_df['singer'].apply(lambda x: model.encode(x))

combined_songs_df.head()

  from tqdm.autonotebook import tqdm, trange


Unnamed: 0,song_name,singer,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,liveness,Valence,tempo,language_code,song_embedding,singer_embedding
0,Naguva Nayana,S. Janaki|S. P. Balasubrahmanyam,0.736514,-0.083493,-0.450116,0.734904,0.009109,-1.137378,0.337627,-0.905578,1.404685,-0.658868,-0.177735,kn,"[0.008150081, 0.046776354, -0.13777135, 0.2363...","[-0.2541248, 0.038421728, -0.2110088, 0.127446..."
1,Ellelli Nodali,S. Janaki|Dr. Rajkumar,0.253803,0.420039,0.471892,-1.457722,-1.712827,-1.137378,-0.113233,-0.092093,0.410475,-0.696996,-0.314435,kn,"[-0.01831829, -0.32449624, -0.023548305, -0.00...","[-0.1047148, -0.122336745, -0.07683745, 0.0566..."
2,Chinnada Mallige Hoove,S. Janaki|Dr. Rajkumar,-1.234557,0.110173,-0.347262,-0.361409,0.223865,-1.137378,-0.134702,0.099125,0.71665,-0.435816,1.018208,kn,"[0.08396285, 0.103031315, -0.051133372, 0.1788...","[-0.1047148, -0.122336745, -0.07683745, 0.0566..."
3,Premalokadinda,K J Yesudas|S. Janaki,0.092899,-1.594089,-1.133356,-1.183644,0.348418,-1.137378,-0.141859,-1.142169,-0.532133,-0.731312,-0.559717,kn,"[-0.16348617, 0.2271394, -0.022751512, 0.08768...","[-0.30077335, 0.19757098, -0.2238111, 0.001967..."
4,Preethiyalli Iro Sukha,S. P. Balasubrahmanyam|Manjula Gururaj,0.495158,0.962304,0.064152,-1.183644,1.190718,0.879215,-0.979168,0.552862,1.404685,-0.519318,-0.876347,kn,"[-0.017613528, 0.3344294, -0.21310134, 0.21164...","[-0.091006845, 0.033737883, -0.22041616, 0.136..."


In [18]:
# label encode the language code
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

combined_songs_df['language_code_encoded'] = label_encoder.fit_transform(combined_songs_df['language_code'])




In [19]:
# creating a map of the label encoding
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'af': np.int64(0), 'ar': np.int64(1), 'as': np.int64(2), 'bho': np.int64(3), 'bn': np.int64(4), 'ca': np.int64(5), 'cs': np.int64(6), 'cy': np.int64(7), 'da': np.int64(8), 'de': np.int64(9), 'el': np.int64(10), 'en': np.int64(11), 'es': np.int64(12), 'et': np.int64(13), 'fi': np.int64(14), 'fr': np.int64(15), 'gu': np.int64(16), 'haryanvi': np.int64(17), 'hi': np.int64(18), 'hr': np.int64(19), 'hu': np.int64(20), 'id': np.int64(21), 'it': np.int64(22), 'ja': np.int64(23), 'kn': np.int64(24), 'ko': np.int64(25), 'ml': np.int64(26), 'mr': np.int64(27), 'nl': np.int64(28), 'no': np.int64(29), 'or': np.int64(30), 'pa': np.int64(31), 'pl': np.int64(32), 'pt': np.int64(33), 'rajasthani': np.int64(34), 'ro': np.int64(35), 'ru': np.int64(36), 'sk': np.int64(37), 'so': np.int64(38), 'sq': np.int64(39), 'sv': np.int64(40), 'sw': np.int64(41), 'ta': np.int64(42), 'te': np.int64(43), 'tl': np.int64(44), 'tr': np.int64(45), 'ur': np.int64(46), 'vi': np.int64(47), nan: np.int64(48)}


In [60]:
combined_songs_df.to_csv('combined_songs_data.csv', index=False)


In [61]:
label_mapping_df = pd.DataFrame(list(label_mapping.items()), columns=['language_code', 'encoded_value'])

label_mapping_df.to_csv('language_code_mapping.csv', index=False)

In [20]:
min_samples = 30

language_counts = combined_songs_df['language_code_encoded'].value_counts()

languages_to_keep = language_counts[language_counts >= min_samples].index

filtered_combined_songs_df = combined_songs_df[combined_songs_df['language_code_encoded'].isin(languages_to_keep)]

In [21]:
numerical_features = ['popularity', 'danceability', 'energy', 'key', 
                      'loudness', 'speechiness', 'acousticness', 
                      'liveness', 'Valence', 'tempo']

numerical_features_matrix = filtered_combined_songs_df[numerical_features].values

song_name_matrix = np.vstack(filtered_combined_songs_df['song_embedding'].values)
singer_matrix = np.vstack(filtered_combined_songs_df['singer_embedding'].values)

X = np.hstack([numerical_features_matrix, song_name_matrix, singer_matrix])

y = filtered_combined_songs_df['language_code_encoded'].values


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with class weights
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)


In [25]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy: {accuracy_rf:.2f}')

print(classification_report(y_test, y_pred_rf))

Accuracy: 0.84
              precision    recall  f1-score   support

           2       0.97      0.78      0.86       145
           3       0.88      0.47      0.61       104
           4       0.82      0.65      0.72       192
           8       1.00      0.38      0.55         8
           9       1.00      0.44      0.61        48
          11       0.89      1.00      0.94      3081
          12       0.97      0.62      0.75       341
          15       1.00      0.23      0.38        13
          16       0.80      0.72      0.76       423
          17       0.81      0.46      0.58        46
          18       0.83      0.89      0.86       728
          21       1.00      0.14      0.25         7
          22       0.67      0.07      0.13        27
          24       0.78      0.70      0.74       712
          26       0.83      0.62      0.71        96
          27       0.79      0.94      0.86       940
          28       1.00      0.29      0.44        14
          30

In [26]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy: {accuracy_svm:.2f}')

print(classification_report(y_test, y_pred_svm))

Accuracy: 0.78
              precision    recall  f1-score   support

           2       0.81      0.86      0.83       145
           3       0.43      0.66      0.52       104
           4       0.70      0.78      0.74       192
           8       0.25      0.50      0.33         8
           9       0.20      0.58      0.30        48
          11       0.96      0.87      0.91      3081
          12       0.61      0.78      0.69       341
          15       0.31      0.38      0.34        13
          16       0.62      0.73      0.67       423
          17       0.48      0.48      0.48        46
          18       0.81      0.83      0.82       728
          21       0.29      0.29      0.29         7
          22       0.17      0.26      0.21        27
          24       0.67      0.73      0.70       712
          26       0.43      0.79      0.56        96
          27       0.88      0.84      0.86       940
          28       0.36      0.29      0.32        14
          30

In [28]:
from sklearn.linear_model import LogisticRegression

logistic_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logistic_clf.fit(X_train, y_train)
y_pred_logistic = logistic_clf.predict(X_test)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Accuracy: {accuracy_logistic:.2f}')

print(classification_report(y_test, y_pred_logistic))

Accuracy: 0.72
              precision    recall  f1-score   support

           2       0.71      0.83      0.77       145
           3       0.34      0.67      0.45       104
           4       0.56      0.75      0.64       192
           8       0.15      0.50      0.23         8
           9       0.21      0.65      0.32        48
          11       0.97      0.73      0.83      3081
          12       0.56      0.72      0.63       341
          15       0.08      0.31      0.13        13
          16       0.67      0.66      0.67       423
          17       0.29      0.65      0.40        46
          18       0.81      0.77      0.79       728
          21       0.05      0.29      0.09         7
          22       0.09      0.33      0.14        27
          24       0.69      0.68      0.68       712
          26       0.38      0.83      0.52        96
          27       0.85      0.77      0.81       940
          28       0.12      0.36      0.18        14
          30

In [29]:
from sklearn.ensemble import ExtraTreesClassifier

extra_trees_clf = ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=42)
extra_trees_clf.fit(X_train, y_train)
y_pred_extra_trees = extra_trees_clf.predict(X_test)

accuracy_extra_trees = accuracy_score(y_test, y_pred_extra_trees)
print(f'Accuracy: {accuracy_extra_trees:.2f}')

print(classification_report(y_test, y_pred_extra_trees))

Accuracy: 0.84
              precision    recall  f1-score   support

           2       0.98      0.76      0.86       145
           3       0.85      0.45      0.59       104
           4       0.79      0.55      0.65       192
           8       1.00      0.38      0.55         8
           9       1.00      0.46      0.63        48
          11       0.89      1.00      0.94      3081
          12       0.97      0.72      0.83       341
          15       1.00      0.23      0.38        13
          16       0.81      0.72      0.76       423
          17       0.81      0.54      0.65        46
          18       0.80      0.86      0.83       728
          21       1.00      0.14      0.25         7
          22       0.86      0.22      0.35        27
          24       0.76      0.69      0.72       712
          26       0.70      0.57      0.63        96
          27       0.75      0.92      0.83       940
          28       1.00      0.36      0.53        14
          30

In [30]:
import joblib

joblib.dump(rf_clf, 'random_forest_model.joblib')

['random_forest_model.joblib']