In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import glob
import os

# Folder containing your MFCC files
folder_path = '/home/avdhoot/third_sem/ds203/project/MFCC-files-v2'  # Update with your path

# Feature extraction for all 20 MFCCs + additional statistical features
features_list = []

for file in range(1, 117):
    # Naming format for MFCC files (like "1.csv", "2.csv", ...)
    filename = os.path.join(folder_path, f"{file}.csv")
    
    # Load MFCCs from CSV
    mfcc_df = pd.read_csv(filename, header=None)
    mfcc_df = mfcc_df.transpose()
    
    # Extract the first 20 MFCCs
    mfcc_20 = mfcc_df.values[:, :20]
    
    # Calculate statistical features for each of the first 5 MFCCs
    file_features = []
    for i in range(5):
        coeff = mfcc_20[:, i]
        file_features.extend([
            np.mean(coeff),           # Mean
            np.var(coeff),            # Variance
            pd.Series(coeff).skew(),  # Skewness
            pd.Series(coeff).kurt(),  # Kurtosis
            np.sqrt(np.mean(coeff**2)),  # Root Mean Square
            np.ptp(coeff),            # Peak-to-peak range
            np.std(coeff)             # Standard deviation
        ])
    
    # Append the features for this file to the list
    features_list.append(file_features)

# Convert the list of features to a DataFrame
columns = []
for i in range(1, 6):
    columns += [f'mean_mfcc{i}', f'var_mfcc{i}', f'skew_mfcc{i}', f'kurt_mfcc{i}', 
                f'rms_mfcc{i}', f'ptp_mfcc{i}', f'std_mfcc{i}']
features_df = pd.DataFrame(features_list, columns=columns)

# Normalize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)

# Perform PCA
pca = PCA(n_components=10)
pca_features = pca.fit_transform(scaled_features)

# Apply different clustering algorithms
cluster_results = {}

# 1. KMeans Clustering
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans_labels = kmeans.fit_predict(pca_features)
cluster_results['KMeans'] = kmeans_labels

# 2. DBSCAN Clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(pca_features)
cluster_results['DBSCAN'] = dbscan_labels

# 3. Agglomerative Clustering
agglo = AgglomerativeClustering(n_clusters=6)
agglo_labels = agglo.fit_predict(pca_features)
cluster_results['Agglomerative'] = agglo_labels

# Prepare data for saving, with file numbers and their cluster assignments
cluster_data = pd.DataFrame({
    'File_Number': list(range(1, 117)),
    'KMeans_Cluster': cluster_results['KMeans'],
    'DBSCAN_Cluster': cluster_results['DBSCAN'],
    'Agglomerative_Cluster': cluster_results['Agglomerative']
})

# Save to a CSV for easy review
# cluster_data.to_csv('clustering_results_with_pca.csv', index=False)

print(cluster_data.head())  # Check the first few entries of the clustering results


   File_Number  KMeans_Cluster  DBSCAN_Cluster  Agglomerative_Cluster
0            1               1              -1                      2
1            2               1              -1                      2
2            3               0              -1                      1
3            4               2              -1                      3
4            5               3              -1                      0


In [4]:
label_df=pd.read_csv("/home/avdhoot/Documents/labels.csv")
print(label_df.shape)


(115, 2)


In [8]:
print(cluster_data)

     File_Number  KMeans_Cluster  DBSCAN_Cluster  Agglomerative_Cluster
0              1               1              -1                      2
1              2               1              -1                      2
2              3               0              -1                      1
3              4               2              -1                      3
4              5               3              -1                      0
..           ...             ...             ...                    ...
111          112               2              -1                      3
112          113               2              -1                      3
113          114               0              -1                      1
114          115               4              -1                      3
115          116               4              -1                      4

[116 rows x 4 columns]


In [2]:
import numpy as np
import pandas as pd
import os
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define feature extraction function
def extract_features(audio_file, n_mfcc=20):
    y, sr = librosa.load(audio_file, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    features = []

    # Compute statistical features for first 5 MFCC coefficients
    for i in range(5):
        coeff = mfcc[i, :]
        features.extend([
            np.mean(coeff), 
            np.var(coeff), 
            pd.Series(coeff).skew(), 
            pd.Series(coeff).kurtosis(), 
            np.sqrt(np.mean(coeff ** 2)),  # RMS
            np.ptp(coeff)  # Peak-to-peak ratio
        ])
    return features

# Paths to labeled folders and their labels
data_dir = "/home/avdhoot/third_sem/ds203/project/param_songs"  # Replace with the actual path
categories = {
    "asha bhosale": 0,
    "kishore kumar": 1,
    "marathi bhavgeet": 2,
    "marathi lavni": 3,
    "mj": 4,
    "na": 5
}

# Collect features and labels
labeled_features = []
labeled_labels = []

for category, label in categories.items():
    category_path = os.path.join(data_dir, category)
    for audio_file in os.listdir(category_path):
        if audio_file.endswith(".mp3") or audio_file.endswith(".wav"):  # Adjust for your audio formats
            file_path = os.path.join(category_path, audio_file)
            features = extract_features(file_path)
            labeled_features.append(features)
            labeled_labels.append(label)

# Convert to DataFrame for labeled data
labeled_df = pd.DataFrame(labeled_features)
labeled_df['label'] = labeled_labels

# Load unlabeled MFCC files and extract same features
unlabeled_dir = "/home/avdhoot/third_sem/ds203/project/MFCC-files-v2"  # Replace with actual path
unlabeled_features = []

for i in range(1, 117):
    file_path = os.path.join(unlabeled_dir, f"{i}.csv")
    mfcc_df = pd.read_csv(file_path, header=None).transpose()

    # Compute features for first 5 MFCCs as before
    file_features = []
    for j in range(5):
        coeff = mfcc_df.iloc[:, j].values
        file_features.extend([
            np.mean(coeff), 
            np.var(coeff), 
            pd.Series(coeff).skew(), 
            pd.Series(coeff).kurtosis(), 
            np.sqrt(np.mean(coeff ** 2)),  # RMS
            np.ptp(coeff)  # Peak-to-peak ratio
        ])
    unlabeled_features.append(file_features)

# Convert to DataFrame for unlabeled data
unlabeled_df = pd.DataFrame(unlabeled_features)

# Standardize features
scaler = StandardScaler()
X_labeled = scaler.fit_transform(labeled_df.drop(columns=['label']))
X_unlabeled = scaler.transform(unlabeled_df)
y_labeled = labeled_df['label'].values

# Split labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train SVM
svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)

# Evaluate model on test set
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")

# Predict labels for unlabeled data
unlabeled_predictions = svm.predict(X_unlabeled)

# Save predictions to a CSV file
unlabeled_results = pd.DataFrame({
    'File_Number': list(range(1, 117)),
    'Predicted_Label': unlabeled_predictions
})
unlabeled_results.to_csv("unlabeled_predictions.csv", index=False)

print("Predictions saved to 'unlabeled_predictions.csv'")


Test Set Accuracy: 66.67%
Predictions saved to 'unlabeled_predictions.csv'


In [3]:
df5=pd.read_csv("unlabeled_predictions.csv")
print(df5)

     File_Number  Predicted_Label
0              1                2
1              2                2
2              3                4
3              4                3
4              5                2
..           ...              ...
111          112                3
112          113                3
113          114                4
114          115                2
115          116                2

[116 rows x 2 columns]


In [4]:
import numpy as np
import pandas as pd
import os
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define feature extraction function
def extract_features(audio_file, n_mfcc=20):
    y, sr = librosa.load(audio_file, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    features = []

    # Compute statistical features for first 5 MFCC coefficients
    for i in range(5):
        coeff = mfcc[i, :]
        features.extend([
            np.mean(coeff), 
            np.var(coeff), 
            pd.Series(coeff).skew(), 
            pd.Series(coeff).kurtosis(), 
            np.sqrt(np.mean(coeff ** 2)),  # RMS
            np.ptp(coeff)  # Peak-to-peak ratio
        ])
    return features

# Paths to labeled folders and their labels
data_dir = "/home/avdhoot/third_sem/ds203/project/param_songs"  # Replace with the actual path
categories = {
    "asha bhosale": 0,
    "kishore kumar": 1,
    "marathi bhavgeet": 2,
    "marathi lavni": 3,
    "mj": 4,
    "na": 5
}

# Collect features and labels
labeled_features = []
labeled_labels = []

for category, label in categories.items():
    category_path = os.path.join(data_dir, category)
    for audio_file in os.listdir(category_path):
        if audio_file.endswith(".mp3") or audio_file.endswith(".wav"):  # Adjust for your audio formats
            file_path = os.path.join(category_path, audio_file)
            features = extract_features(file_path)
            labeled_features.append(features)
            labeled_labels.append(label)

# Convert to DataFrame for labeled data
labeled_df = pd.DataFrame(labeled_features)
labeled_df['label'] = labeled_labels

# Load unlabeled MFCC files and extract same features
unlabeled_dir = "/home/avdhoot/third_sem/ds203/project/MFCC-files-v2"  # Replace with actual path
unlabeled_features = []

for i in range(1, 117):
    file_path = os.path.join(unlabeled_dir, f"{i}.csv")
    mfcc_df = pd.read_csv(file_path, header=None).transpose()

    # Compute features for first 5 MFCCs as before
    file_features = []
    for j in range(5):
        coeff = mfcc_df.iloc[:, j].values
        file_features.extend([
            np.mean(coeff), 
            np.var(coeff), 
            pd.Series(coeff).skew(), 
            pd.Series(coeff).kurtosis(), 
            np.sqrt(np.mean(coeff ** 2)),  # RMS
            np.ptp(coeff)  # Peak-to-peak ratio
        ])
    unlabeled_features.append(file_features)

# Convert to DataFrame for unlabeled data
unlabeled_df = pd.DataFrame(unlabeled_features)

# Standardize features
scaler = StandardScaler()
X_labeled = scaler.fit_transform(labeled_df.drop(columns=['label']))
X_unlabeled = scaler.transform(unlabeled_df)
y_labeled = labeled_df['label'].values

# Split labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train SVM
svm = SVC(kernel='rbf', C=1, random_state=42)
svm.fit(X_train, y_train)

# Evaluate model on test set
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")

# Predict labels for unlabeled data
unlabeled_predictions = svm.predict(X_unlabeled)

# Save predictions to a CSV file
unlabeled_results = pd.DataFrame({
    'File_Number': list(range(1, 117)),
    'Predicted_Label': unlabeled_predictions
})
unlabeled_results.to_csv("unlabeled_predictions.csv", index=False)

print("Predictions saved to 'unlabeled_predictions.csv'")


Test Set Accuracy: 66.67%
Predictions saved to 'unlabeled_predictions.csv'


In [6]:
df5=pd.read_csv("/home/avdhoot/Documents/labels.csv")
print(df5.shape)

(115, 2)


In [7]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Paths to directories
labeled_data_folder = '/home/avdhoot/third_sem/ds203/project/param_songs'  # Replace with actual path
unlabeled_mfcc_folder = '/home/avdhoot/third_sem/ds203/project/MFCC-files-v2'  # Replace with actual path

# Categories and labels for the labeled dataset
categories = {
    'na': 0,
    'marathi lavni': 1,
    'marathi bhavgeet': 2,
    'asha bhosale': 3,
    'mj': 4,
    'kishore kumar': 5
}

# Function to extract features from audio files
def extract_audio_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    features = []
    for i in range(5):  # First 5 MFCCs
        features.extend([np.mean(mfcc[i]), np.var(mfcc[i]), np.std(mfcc[i]), 
                         pd.Series(mfcc[i]).skew(), pd.Series(mfcc[i]).kurt(), 
                         np.sqrt(np.mean(mfcc[i]**2)), np.ptp(mfcc[i])])
    return features

# Prepare labeled training data
train_features = []
train_labels = []

for category, label in categories.items():
    category_folder = os.path.join(labeled_data_folder, category)
    for file_name in os.listdir(category_folder):
        if file_name.endswith('.mp3'):
            file_path = os.path.join(category_folder, file_name)
            features = extract_audio_features(file_path)
            train_features.append(features)
            train_labels.append(label)

train_features = np.array(train_features)
train_labels = np.array(train_labels)

# Standardize features
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

# Reduce dimensionality with PCA if desired
pca = PCA(n_components=20)
train_features = pca.fit_transform(train_features)

# Train SVM model
svm_model = SVC(kernel='rbf')  # RBF kernel can capture non-linear relations
svm_model.fit(train_features, train_labels)

# Prepare validation data
df5 = pd.DataFrame()  # Assuming df5 is loaded with file numbers and labels
validation_features = []
validation_labels = []

for _, row in df5.iterrows():
    file_number = int(row[0])
    label = int(row[1])
    file_path = os.path.join(unlabeled_mfcc_folder, f"{file_number}.csv")
    
    # Load MFCCs directly from the CSV files
    mfcc_df = pd.read_csv(file_path, header=None)
    mfcc_df = mfcc_df.transpose()
    
    # Extract features from the first 5 MFCCs
    features = []
    for i in range(5):
        mfcc_coeff = mfcc_df[i].values
        features.extend([np.mean(mfcc_coeff), np.var(mfcc_coeff), np.std(mfcc_coeff),
                         pd.Series(mfcc_coeff).skew(), pd.Series(mfcc_coeff).kurt(),
                         np.sqrt(np.mean(mfcc_coeff**2)), np.ptp(mfcc_coeff)])
    
    validation_features.append(features)
    validation_labels.append(label)

validation_features = np.array(validation_features)
validation_labels = np.array(validation_labels)

# Standardize validation data
validation_features = scaler.transform(validation_features)
validation_features = pca.transform(validation_features)

# Predict and evaluate
predictions = svm_model.predict(validation_features)
accuracy = accuracy_score(validation_labels, predictions)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [8]:
print(validation_features.shape)
print(train_features.shape)

(0,)
(27, 20)


In [11]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Paths to directories
labeled_data_folder = '/home/avdhoot/third_sem/ds203/project/param_songs'  # Replace with actual path
unlabeled_mfcc_folder = '/home/avdhoot/third_sem/ds203/project/data/MFCC-files-v2'  # Replace with actual path

# Categories and labels for the labeled dataset
categories = {
    'na': 0,
    'marathi lavni': 1,
    'marathi bhavgeet': 2,
    'asha bhosale': 3,
    'mj': 4,
    'kishore kumar': 5
}

# Function to extract features from audio files
def extract_audio_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    features = []
    for i in range(5):  # First 5 MFCCs
        features.extend([np.mean(mfcc[i]), np.var(mfcc[i]), np.std(mfcc[i]), 
                         pd.Series(mfcc[i]).skew(), pd.Series(mfcc[i]).kurt(), 
                         np.sqrt(np.mean(mfcc[i]**2)), np.ptp(mfcc[i])])
    return features

# Prepare labeled training data
train_features = []
train_labels = []

for category, label in categories.items():
    category_folder = os.path.join(labeled_data_folder, category)
    for file_name in os.listdir(category_folder):
        if file_name.endswith('.mp3'):
            file_path = os.path.join(category_folder, file_name)
            features = extract_audio_features(file_path)
            train_features.append(features)
            train_labels.append(label)

train_features = np.array(train_features)
train_labels = np.array(train_labels)

# Standardize features
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

# Reduce dimensionality with PCA if desired
pca = PCA(n_components=20)
train_features = pca.fit_transform(train_features)

# Train SVM model
svm_model = SVC(kernel='rbf')  # RBF kernel can capture non-linear relations
svm_model.fit(train_features, train_labels)

# Prepare validation data
validation_features = []
validation_labels = []

for _, row in df5.iterrows():
    file_number = int(row[0])
    label = int(row[1])
    file_path = os.path.join(unlabeled_mfcc_folder, f"{file_number}.csv")
    
    # Check if file exists
    if not os.path.isfile(file_path):
        print(f"File {file_path} not found, skipping.")
        continue
    
    # Load MFCCs directly from the CSV files
    mfcc_df = pd.read_csv(file_path, header=None)
    
    # Check if the file is empty or has fewer rows than expected
    if mfcc_df.empty or mfcc_df.shape[0] < 20:
        print(f"File {file_path} is empty or has insufficient rows, skipping.")
        continue
    
    # Transpose to have frames on columns and coefficients in rows
    mfcc_df = mfcc_df.transpose()
    
    # Extract features from the first 5 MFCCs
    features = []
    for i in range(5):
        mfcc_coeff = mfcc_df[i].values
        features.extend([np.mean(mfcc_coeff), np.var(mfcc_coeff), np.std(mfcc_coeff),
                         pd.Series(mfcc_coeff).skew(), pd.Series(mfcc_coeff).kurt(),
                         np.sqrt(np.mean(mfcc_coeff**2)), np.ptp(mfcc_coeff)])
    
    validation_features.append(features)
    validation_labels.append(label)

# Convert to NumPy array if there are any valid entries
if validation_features:
    validation_features = np.array(validation_features)
    validation_labels = np.array(validation_labels)

    # Standardize validation data
    validation_features = scaler.transform(validation_features)
    validation_features = pca.transform(validation_features)

    # Predict and evaluate
    predictions = svm_model.predict(validation_features)
    accuracy = accuracy_score(validation_labels, predictions)

    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
else:
    print("No valid validation data available.")


No valid validation data available.


In [1]:
import os
import shutil

# Define paths
neural_data_folder = '/home/avdhoot/third_sem/ds203/project/neural_data'  # Replace with your actual path
new_folder = os.path.join(neural_data_folder, 'kk')
os.makedirs(new_folder, exist_ok=True)

# Loop through items in the neural data folder
for item in os.listdir(neural_data_folder):
    item_path = os.path.join(neural_data_folder, item)
    
    # Check if it's a standalone file (not a folder)
    if os.path.isfile(item_path):
        shutil.copy(item_path, new_folder)

print("Standalone files have been copied to 'kk'.")


Standalone files have been copied to 'kk'.


In [2]:
import os

# Define the path to your neural data folder
neural_data_folder = '/home/avdhoot/third_sem/ds203/project/neural_data'  # Replace with your actual path

# Loop through items in the neural data folder
for item in os.listdir(neural_data_folder):
    item_path = os.path.join(neural_data_folder, item)
    
    # Check if it's a standalone file (not a folder) and delete it
    if os.path.isfile(item_path):
        os.remove(item_path)
        print(f"Deleted file: {item}")

print("Standalone files have been deleted, folders are preserved.")


Deleted file: Ban Ke Gulgule - Apna Haath Jagnnath 128 Kbps.mp3
Deleted file: Bachna Ae Hasinon Lo Main Aa Gaya - Hum Kisise Kum Naheen 128 Kbps.mp3
Deleted file: Badi Sooni Sooni Hai Zindagi - Mili 128 Kbps.mp3
Deleted file: Apni To Jaise Taise - Laawaris 128 Kbps.mp3
Deleted file: Chahiye Thoda Pyar - Lahu Ke Do Rang 128 Kbps.mp3
Deleted file: Aapke Anurodh Pe - Anurodh 128 Kbps.mp3
Deleted file: Chal Sapnon Ke Shahar Mein - Deewangi 1976 128 Kbps.mp3
Deleted file: Ae-Oh-Aa-Zara-Mudke-From-Disco-Dancer-Kishore-Kumar.mp3
Deleted file: Aaye Tum Yaad Mujhe - Mili 128 Kbps.mp3
Deleted file: Aaj-Ei-Dintake-Kishore-Kumar.mp3
Deleted file: Andheri Raaton Mein - Shahenshah 128 Kbps.mp3
Deleted file: Chal Chal Chal Mere Saathi - Haathi Mere Saathi (1971) 128 Kbps.mp3
Deleted file: Chala Jata Hoon - Mere Jeevan Saathi 128 Kbps.mp3
Deleted file: Chahe-Koi-Khush-Ho-Kishore-Kumar.mp3
Deleted file: Aise Na Mujhe - Darling Darling 128 Kbps.mp3
Deleted file: Aaj Unse Pehli Mulaqat Hogi - Paraya Dhan