In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
df = pd.read_csv('/content/drive/MyDrive/ML-Project-files/survey_results.csv')

In [10]:
df.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


In [11]:
# Select the desired columns and drop the unwanted columns
columns_to_keep = ['Age', 'Anxiety', 'Depression', 'Insomnia', 'OCD', 'Fav genre']
df = df.loc[:, columns_to_keep]

# Now your DataFrame contains only the selected columns

In [12]:
from sklearn.preprocessing import LabelEncoder
# Display the first few rows of your data to verify it's loaded correctly
print(df.head())
# Create a label encoder

genre_encoder = LabelEncoder()

df['Fav genre'] = genre_encoder.fit_transform(df['Fav genre'])
# The 'Fav genre' column is now encoded with numerical values
print("\n after encoding.. \n")
df.head()
number_of_encoded_genres = df['Fav genre'].unique()
print(number_of_encoded_genres)

    Age  Anxiety  Depression  Insomnia  OCD         Fav genre
0  18.0      3.0         0.0       1.0  0.0             Latin
1  63.0      7.0         2.0       2.0  1.0              Rock
2  18.0      7.0         7.0      10.0  2.0  Video game music
3  61.0      9.0         7.0       3.0  3.0              Jazz
4  18.0      7.0         2.0       5.0  9.0               R&B

 after encoding.. 

[ 8 14 15  6 12  7  1  2  5 11 13  0 10  3  9  4]


In [13]:
# Data cleaning

# Handle missing values (replace with mean, median, or mode, as needed)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Anxiety'].fillna(df['Anxiety'].median(), inplace=True)
df['Depression'].fillna(df['Depression'].median(), inplace=True)
df['Insomnia'].fillna(df['Insomnia'].median(), inplace=True)
df['OCD'].fillna(df['OCD'].median(), inplace=True)

# Detect and handle outliers (using Z-score as an example)
from scipy import stats
z_scores = stats.zscore(df[['Age', 'Anxiety', 'Depression', 'Insomnia', 'OCD']])
df = df[(z_scores < 3).all(axis=1)]  # Remove rows with outliers

# Check the data distribution and other anomalies

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Create a Min-Max scaler
scaler = MinMaxScaler()

# Define the columns to be scaled
columns_to_scale = ['Age', 'Anxiety', 'Depression', 'Insomnia', 'OCD']

# Apply Min-Max scaling to the selected columns
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Now, the 'Age', 'Anxiety', 'Depression', 'Insomnia', and 'OCD' columns are scaled to [0, 1]

print("\n After rescaling using min-max\n")
print(df.head())


 After rescaling using min-max

        Age  Anxiety  Depression  Insomnia  OCD  Fav genre
0  0.156863      0.3         0.0       0.1  0.0          8
2  0.156863      0.7         0.7       1.0  0.2         15
3  1.000000      0.9         0.7       0.3  0.3          6
4  0.156863      0.7         0.2       0.5  0.9         12
5  0.156863      0.8         0.8       0.7  0.7          6


In [15]:
from sklearn.model_selection import train_test_split

# Specify the features and target variable
X = df[['Age', 'Anxiety', 'Depression', 'Insomnia', 'OCD']]
y = df['Fav genre']

# Split the data into training and validation sets
X_train, y_train = X, y

# X_train and y_train are your training data
# X_validation and y_validation are your validation data

In [16]:
# Converting to tensor for training in  tensorflow

# Convert Pandas DataFrames to TensorFlow tensors
X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.int64)  # Assuming 'Fav genre' is encoded as integers


In [25]:
num_unique_genres = len(y_train.unique())
# Create a sequential model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_shape=(5,), activation='relu'),  # Example dropout layer
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_unique_genres, activation='softmax')
])

opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train_tensor, y_train_tensor, epochs=200, batch_size=32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [28]:
# To save the trained model
def download_history():
  import pickle
  from google.colab import files

  with open('history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

  files.download('history.pkl')

download_history()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# To retrive it for predictions
# Load the saved model from file
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Use the loaded model for prediction
predictions = loaded_model.predict(input_data)