# Baseline Submission for the Challenge YPMSD

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ayushshivani/aicrowd_educational_baselines/blob/master/YPMSD_baseline.ipynb)


## Import necessary packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Download Dataset

In [None]:
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-public-datasets/aicrowd_educational_sngyr/train.zip
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-public-datasets/aicrowd_educational_ypmsd/data/public/test.csv
!unzip train.zip

## Load Data

In [2]:
train_data = pd.read_csv('train.csv')

## Clean and analyse the data

In [3]:
train_data.head()

Unnamed: 0,year,timbre_mean_0,timbre_mean_1,timbre_mean_2,timbre_mean_3,timbre_mean_4,timbre_mean_5,timbre_mean_6,timbre_mean_7,timbre_mean_8,...,timbre_cov_68,timbre_cov_69,timbre_cov_70,timbre_cov_71,timbre_cov_72,timbre_cov_73,timbre_cov_74,timbre_cov_75,timbre_cov_76,timbre_cov_77
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [4]:
train_data.describe()

Unnamed: 0,year,timbre_mean_0,timbre_mean_1,timbre_mean_2,timbre_mean_3,timbre_mean_4,timbre_mean_5,timbre_mean_6,timbre_mean_7,timbre_mean_8,...,timbre_cov_68,timbre_cov_69,timbre_cov_70,timbre_cov_71,timbre_cov_72,timbre_cov_73,timbre_cov_74,timbre_cov_75,timbre_cov_76,timbre_cov_77
count,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,...,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0,463715.0
mean,1998.386095,43.385488,1.261091,8.650195,1.130763,-6.512725,-9.565527,-2.384609,-1.793722,3.714584,...,15.743361,-73.067753,41.423976,37.780868,0.345259,17.59928,-26.364826,4.444985,19.739307,1.323326
std,10.939767,6.079139,51.613473,35.26475,16.334672,22.85582,12.836758,14.580245,7.961876,10.579241,...,32.086356,175.376872,121.79461,94.874474,16.153797,114.336522,174.187892,13.320996,184.843503,22.045404
min,1922.0,1.749,-337.0925,-301.00506,-154.18358,-181.95337,-81.79429,-188.214,-72.50385,-126.47904,...,-437.72203,-4402.37644,-1810.68919,-3098.35031,-341.78912,-3168.92457,-4319.99232,-236.03926,-7458.37815,-318.22333
25%,1994.0,39.95754,-26.15381,-11.44192,-8.515155,-20.63696,-18.468705,-10.77634,-6.4614,-2.3036,...,-1.798085,-139.062035,-20.918635,-4.71147,-6.75816,-31.563615,-101.396245,-2.57283,-59.59803,-8.813335
50%,2002.0,44.26257,8.37155,10.47052,-0.69161,-5.99274,-11.20885,-2.04785,-1.73544,3.81684,...,9.16136,-52.87801,28.70987,33.49455,0.82835,15.55449,-21.12357,3.11112,7.58695,0.05284
75%,2006.0,47.83365,36.14378,29.741165,8.756995,7.74959,-2.42259,6.51571,2.90513,9.95096,...,26.24829,13.62066,89.419995,77.6747,8.495715,67.743725,52.29985,9.948955,86.203115,9.67074
max,2011.0,61.97014,384.06573,322.85143,289.52743,262.06887,119.81559,172.40268,105.21028,146.29795,...,840.97338,4469.45487,3210.7017,1672.6471,260.5449,3662.06565,2833.60895,463.4195,7393.39844,600.76624


## Split Data for Train and Validation

In [5]:
X = train_data.drop('year',1)
y = train_data['year']
# Validation testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Define the Classifier and Train

In [6]:
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Check which variables have the most impact

In [7]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])  
coeff_df.head()

Unnamed: 0,Coefficient
timbre_mean_0,0.873376
timbre_mean_1,-0.055835
timbre_mean_2,-0.043576
timbre_mean_3,0.004539
timbre_mean_4,-0.015032


## Predict on validation

In [8]:
y_pred = regressor.predict(X_val)

In [9]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})
df1 = df.head(25)
df1

Unnamed: 0,Actual,Predicted
332595,2004,2002.979558
230573,1989,1996.446079
364530,1987,1995.333451
82857,2002,1998.16332
108108,1971,1998.303355
446568,2005,2000.499458
27815,2004,1995.818434
214974,1997,1999.666288
304899,2006,2005.025704
257881,2007,1998.581968


## Evaluate the Performance

In [10]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

Mean Absolute Error: 6.77395050034565
Mean Squared Error: 90.87071514117896
Root Mean Squared Error: 9.53261323778422


## Load Test Set

In [11]:
test_data = pd.read_csv('test.csv')

In [12]:
test_data.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90
0,45.442,-30.74976,31.78587,4.63569,-15.14894,0.2337,-11.97968,-9.59708,6.48111,-8.89073,...,-8.84046,-0.15439,137.4421,77.54739,-4.22875,-61.92657,-33.52722,-3.86253,36.424,7.17309
1,52.67814,-2.88914,43.95268,-1.39209,-14.93379,-15.86877,1.19379,0.31401,-4.44235,-5.78934,...,-5.74356,-42.5791,-2.91103,48.72805,-3.08183,-9.38888,-7.27179,-4.00966,-68.96211,-5.21525
2,45.74235,12.02291,11.03009,-11.60763,11.80054,-11.12389,-5.39058,-1.11981,-7.74086,-3.33421,...,-4.70606,-24.22599,-35.22686,27.77729,15.38934,58.20036,-61.12698,-10.92522,26.75348,-5.78743
3,52.55883,2.87222,27.38848,-5.76235,-15.35766,-15.01592,-5.86893,-0.31447,-5.06922,-4.62734,...,-8.35215,-16.86791,-10.58277,40.10173,-0.54005,-11.54746,-45.3586,-4.55694,-43.17368,-3.33725
4,51.34809,9.02702,25.33757,-6.62537,0.03367,-12.69565,-3.134,2.98649,-6.7175,-1.85804,...,-6.87366,-20.03371,-66.3894,50.56569,0.27747,67.05657,-55.58846,-7.50859,28.23511,-0.72045


## Predict on test set

In [13]:
y_test = regressor.predict(test_data)

Since its integer regression, convert to integers

In [14]:
y_inttest = [int(i) for i in y_test]
y_inttest = np.asarray(y_inttest)

## Save it in correct format

In [15]:
df = pd.DataFrame(y_inttest,columns=['year'])
df.to_csv('submission.csv',index=False)

## To download the generated in collab csv run the below command

In [None]:
from google.colab import files
files.download('submission.csv') 

To participate in the challenge click [here](https://www.aicrowd.com/challenges/ypmsd-music-year-prediction)