In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

In [None]:
# Raw training interactions; the path is kept in a name so it is easy to spot and change.
train_csv_path = r'C:\Users\casocha\Desktop\Kaggle\music\train.csv/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Song metadata table, later joined onto the interactions via `song_id`.
songs_csv_path = r'C:\Users\casocha\Desktop\Kaggle\music\songs.csv/songs.csv'
df_song = pd.read_csv(songs_csv_path)
df_song.head()

In [None]:
# Member table, later joined onto the interactions via `msno`.
members_csv_path = r'C:\Users\casocha\Desktop\Kaggle\music\members.csv/members.csv'
df_mems = pd.read_csv(members_csv_path)
df_mems.head()

In [None]:
# Test interactions that are scored for the submission.
# NOTE(review): this file is read from the `members.csv` folder, unlike the other
# loaders which each use a folder named after their own file — confirm the path.
test_csv_path = r'C:\Users\casocha\Desktop\Kaggle\music\members.csv/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Row counts of the training, song and member tables, printed on one line.
row_counts = [len(frame) for frame in (df_train, df_song, df_mems)]
print(*row_counts)

In [None]:
def _split_yyyymmdd(values):
    """Split a Series of YYYYMMDD-style values into (year, month, day) integer Series.

    The values are converted to text and sliced at positions 0-4, 4-6 and 6-8,
    so the result carries the same index as ``values``.
    """
    text = values.astype(str)
    return (
        text.str[0:4].astype('int64'),
        text.str[4:6].astype('int64'),
        text.str[6:8].astype('int64'),
    )


reg_year, reg_month, reg_day = _split_yyyymmdd(df_mems['registration_init_time'])
exp_year, exp_month, exp_day = _split_yyyymmdd(df_mems['expiration_date'])

# Drop the raw encoded columns first, then attach the parts. Doing it in this order
# keeps the expiration day: overwriting `expiration_date` with the day and then
# dropping `expiration_date` would discard it, leaving the two dates asymmetric.
df_mems = (
    df_mems
    .drop(['registration_init_time', 'expiration_date'], axis=1)
    .assign(
        registration_year=reg_year,
        registration_month=reg_month,
        registration_date=reg_day,   # day of month
        expiration_year=exp_year,
        expiration_month=exp_month,
        expiration_date=exp_day,     # day of month, mirrors `registration_date`
    )
)

In [None]:
# Distinct values seen in the `source_system_tab` column.
pd.unique(df_train['source_system_tab'])

In [None]:
# Distinct values seen in the `source_type` column.
pd.unique(df_train['source_type'])

In [None]:
# Distinct values seen in the `source_screen_name` column.
pd.unique(df_train['source_screen_name'])

In [None]:
# Seaborn palette name passed as `palette=pal` to the count plots in this notebook.
pal = 'husl'

In [None]:
# Row counts per `source_system_tab`, with one bar per `target` value.
fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
sns.countplot(data=df_train, x="source_system_tab", hue='target', palette=pal, ax=axarr)

# Slant the category labels so long names do not overlap.
tick_labels = axarr.get_xticklabels()
axarr.set_xticklabels(tick_labels, rotation=40, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Row counts per `source_type`, with one bar per `target` value.
fig, axarr = plt.subplots(1, 1, figsize=(12, 8))
sns.countplot(x="source_type", data=df_train, hue='target', palette=pal, ax=axarr)
axarr.set_xticklabels(axarr.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
# Display the figure built in this cell; closing a hard-coded figure number
# does not refer to this figure and shows nothing.
plt.show()

In [None]:
# Attach member attributes (key `msno`) and song attributes (key `song_id`) to each
# training row. Left joins keep every training row even when a key has no match.
df_train = pd.merge(left=df_train, right=df_mems, how='left', on='msno')
df_train = pd.merge(left=df_train, right=df_song, how='left', on='song_id')

# Use -1 as the sentinel for a missing song length. The filled column is assigned
# back explicitly: calling `fillna(..., inplace=True)` on `df_train.song_length`
# acts on an intermediate Series and is not guaranteed to update the frame.
df_train['song_length'] = df_train['song_length'].fillna(-1)

In [None]:
# Attach member attributes (key `msno`) and song attributes (key `song_id`) to each
# test row. Left joins keep every test row even when a key has no match.
df_test = pd.merge(left=df_test, right=df_mems, how='left', on='msno')
df_test = pd.merge(left=df_test, right=df_song, how='left', on='song_id')

# Use -1 as the sentinel for a missing song length. The filled column is assigned
# back explicitly: calling `fillna(..., inplace=True)` on `df_test.song_length`
# acts on an intermediate Series and is not guaranteed to update the frame.
df_test['song_length'] = df_test['song_length'].fillna(-1)

In [None]:
# Summary of the merged test frame (columns, dtypes, non-null counts).
df_test.info()

In [None]:
fig, axarr = plt.subplots(1, 1, figsize=(12, 8))

# Correlate numeric columns only: the merged frame still holds object (string)
# columns at this point, and those cannot take part in a correlation matrix.
cor = df_train.select_dtypes(include=[np.number]).corr()

# Boolean mask hiding the upper triangle (diagonal included), since the matrix
# is symmetric and the lower triangle already shows every pair once.
mask = np.zeros_like(cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(cor, mask=mask, square=True, cmap="YlGnBu", ax=axarr)
plt.title('Heatmap of Attribute Correlation')

In [None]:
# Kernel density estimate of the `bd` column, labelled as age.
# NOTE(review): `bw` is seaborn's legacy bandwidth argument; newer releases use
# `bw_method` / `bw_adjust` — confirm against the installed seaborn version.
fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
age_values = df_train['bd']
sns.kdeplot(age_values, bw=2, label='Age', ax=axarr)
axarr.set_title('Age distribution')

In [None]:
# Keep only rows whose `bd` (age) is below 110 to cut off the extreme upper tail.
df_int = df_train[df_train['bd'] < 110]
# Report how many rows survive the filter (the frame this cell just built).
print(len(df_int))

In [None]:
# Same density plot as for the full data, restricted to the filtered frame `df_int`.
# NOTE(review): `bw` is seaborn's legacy bandwidth argument; newer releases use
# `bw_method` / `bw_adjust` — confirm against the installed seaborn version.
fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
filtered_age = df_int['bd']
sns.kdeplot(filtered_age, bw=2, label='Age', ax=axarr)
axarr.set_title('Age distribution without outliers')

In [None]:
# Number of training rows per song `language` code.
fig, axarr = plt.subplots(1, 1, figsize=(12, 8))
sns.countplot(x="language", data=df_train, palette=pal, ax=axarr)
axarr.set_xticklabels(axarr.get_xticklabels(), rotation=40, ha="right")
plt.title('Song count by language')
plt.tight_layout()
# Display the figure built in this cell; closing a hard-coded figure number
# does not refer to this figure and shows nothing.
plt.show()

In [None]:
# Number of training rows per `bd` (age) value.
fig, axarr = plt.subplots(1, 1, figsize=(12, 8))
sns.countplot(x="bd", data=df_train, palette=pal, ax=axarr)
axarr.set_xticklabels(axarr.get_xticklabels(), rotation=60, ha="right")
# The x axis is `bd`, so the title names age rather than language.
plt.title('Song count by age')
plt.tight_layout()
# Display the figure built in this cell; closing a hard-coded figure number
# does not refer to this figure and shows nothing.
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Every training column except the label is treated as a feature.
cols = df_train.columns.tolist()
cols.remove('target')

for col in cols:
    # Numeric columns are used as they are; only object columns need encoding.
    if df_train[col].dtype != 'object':
        continue

    # Turn every value into a string so the encoder sees one uniform type.
    df_train[col] = df_train[col].map(str)
    df_test[col] = df_test[col].map(str)

    train_vals = list(df_train[col].unique())
    test_vals = list(df_test[col].unique())

    # Fit on the values of both frames so transforming the test column
    # never meets a label the encoder has not seen.
    le = LabelEncoder().fit(train_vals + test_vals)
    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

    print('{}: {}, {}'.format(col, len(train_vals), len(test_vals)))

print(df_train.head())
print(df_test.head())

# Feature matrix and row ids of the test frame, used for the final prediction.
out_test = df_test.drop('id', axis=1).values
ids = df_test['id'].values


### XGBoost

In [None]:
# Features and labels as plain arrays.
X_train = df_train.drop('target', axis=1).values
y_train = df_train['target'].values

# Test features and their row ids.
X_test = df_test.drop('id', axis=1).values
ids = df_test['id'].values

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

In [None]:
import xgboost as xgb

In [None]:
# Training split wrapped as an XGBoost DMatrix, with its labels.
train_dmat = xgb.DMatrix(X_train, label=y_train)

In [None]:
# Validation split (despite the name `dtest`) wrapped as a labelled DMatrix.
dtest = xgb.DMatrix(data=X_valid, label=y_valid)

In [None]:
# Unlabelled test features that the final model scores.
final_test = xgb.DMatrix(data=out_test)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameters: 2 * 1 * 3 * 3 = 18 combinations, each cross-validated 3 times.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5, 10],
    'learning_rate': [0.001, 0.01, 0.1],
}

gbm = xgb.XGBClassifier(nthread=-1)

# The variable name `grid_mse` is kept for compatibility; the search is scored
# by ROC AUC, not by an error metric.
grid_mse = GridSearchCV(
    param_grid=gbm_param_grid,
    estimator=gbm,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

grid_mse.fit(X_train, y_train)

print("Best parameters found: ", grid_mse.best_params_)
# `best_score_` is the mean cross-validated ROC AUC of the best candidate, so it
# is reported directly; taking a square root and calling it RMSE would be wrong.
print("Best ROC AUC found: ", grid_mse.best_score_)

### Train / Predict

In [None]:
# Booster settings for the final model.
# NOTE(review): these values are hard-coded and not read from the grid-search
# result — confirm that this is intended.
# NOTE(review): newer XGBoost releases use `verbosity` instead of `silent` —
# confirm against the installed version.
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "max_depth": 10,
    "colsample_bytree": 0.3,
    "eval_metric": "auc",
    "silent": 1,
}

# Without an `evals` list, `eval_metric` and `verbose_eval` produce no output.
# Watching both splits prints train/validation AUC every 5 rounds and makes use
# of the validation DMatrix (`dtest`) that was built for this purpose.
watchlist = [(train_dmat, 'train'), (dtest, 'valid')]

model = xgb.train(
    params,
    train_dmat,
    evals=watchlist,
    maximize=True,
    verbose_eval=5,
)

In [None]:
# Score the test matrix with the trained booster.
p_test = model.predict(final_test)

# Prepare submission
# The two lengths must match: one prediction per test id.
print(len(ids), len(p_test))
subm = pd.DataFrame({'id': ids, 'target': p_test}, columns=['id', 'target'])

In [None]:
# Write the submission file without the DataFrame index column.
submission_csv_path = r'C:\Users\casocha\Desktop\Kaggle\music\train.csv/submission.csv'
subm.to_csv(submission_csv_path, index=False)

In [None]:
# Number of rows in the submission frame.
subm.shape[0]