In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from google.cloud import bigquery
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Create the BigQuery client (constructed here with no explicit arguments).
client = bigquery.Client()

# Query text: select every column of the engineered table.
# Replace the placeholder project/dataset/table identifiers with your own.
sql = "SELECT * FROM  `yourGCPproject.yourDataset.YourTable`"

# Submit the query, then materialise its result as a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()

In [None]:
# Report the (rows, columns) dimensions of the full frame.
full_shape = df.shape
print("Data Shape:", full_shape)

In [None]:
# Count rows per TARGET_CLASSIFICATION value to inspect the class balance.
df.groupby(by='TARGET_CLASSIFICATION').size()

In [None]:
# Replace every null in the frame with 0 (applies to all columns).
df = df.fillna(value=0)

In [None]:
# Down-sample the frame so the EDA plots below stay fast.
# For a full run set SAMPLE_FRAC = 1.0 instead of commenting this cell out:
# later cells read `df_small`, so it must always be defined.
SAMPLE_FRAC = 0.1
# Fixed seed so the same rows (and therefore the same plots) come back on
# every Restart & Run All.
SAMPLE_SEED = 42
df_small = df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [None]:
# Report the (rows, columns) dimensions of the sampled frame.
sample_shape = df_small.shape
print("Data Shape:", sample_shape)

In [None]:
# Preview the first five rows of the sampled frame.
df_small.head(n=5)

In [None]:
# Examine the correlation matrix as a heat map.
# Restrict to numeric/boolean columns first: from pandas 2.0 onward
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on them instead (e.g. when the frame contains string columns).
numeric_small = df_small.select_dtypes(include=["number", "bool"])
plt.matshow(numeric_small.corr())
plt.show()

In [None]:
# Examine correlation matrix #2
def plot_corr(df, size=7):
    """Draw the pairwise correlation matrix of ``df`` as a labelled heat map.

    Args:
        df: pandas DataFrame; only its numeric/boolean columns are correlated.
        size: Value used for both the width and the height of the figure.

    Returns:
        The matplotlib Axes holding the heat map.
    """
    # Non-numeric columns make DataFrame.corr() raise on pandas >= 2.0.
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # Label ticks on this Axes directly instead of going through the pyplot
    # "current axes" state; rotate x labels so long column names stay legible.
    positions = list(range(len(corr.columns)))
    ax.set_xticks(positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns)
    return ax


# NOTE: plot_corr is only defined above, not called; this cell's output is
# the styled correlation table below.
corr = df_small.select_dtypes(include=["number", "bool"]).corr()
# axis=None with fixed vmin/vmax puts the whole table on one -1..1 colour
# scale; the default scales each column independently, which makes colours
# incomparable across columns.
corr.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)

In [None]:
# Scatter plots of selected features against a numeric variable similar to the target feature
sns.set(style="white")

# First group of candidate features, one panel per feature.
feature_group_1 = ['prospect_feature1', 'prospect_feature2',
                   'prospect_feature3', 'prospect_feature4']
pp = sns.pairplot(
    data=df_small,
    x_vars=feature_group_1,
    y_vars=['TARGET_RELATED_DATA'],
    hue='TARGET_CLASSIFICATION',
)

In [None]:
# One large scatter plot: TARGET_RELATED_DATA (x) against prospect_feature1 (y),
# coloured by TARGET_CLASSIFICATION.
# Repeat this cell as EDA for any other feature where a scatter plot is meaningful.
f, ax = plt.subplots(figsize=(10, 10))
sns.despine(fig=f, left=True, bottom=True)

# Collect the plot options first, then draw onto the Axes created above.
scatter_options = dict(
    x="TARGET_RELATED_DATA",
    y="prospect_feature1",
    hue="TARGET_CLASSIFICATION",
    palette="ch:r=-.2,d=.3_r",
    sizes=(1, 8),
    linewidth=0,
)
sns.scatterplot(data=df_small, ax=ax, **scatter_options)

In [None]:
sns.set(style="whitegrid", palette="pastel", color_codes=True)

# Nested violin plot of prospect_feature2 grouped by the categorical
# prospect_feature1, with each violin split by TARGET_CLASSIFICATION so the
# classes can be compared side by side.
# NOTE(review): split=True presumably requires a two-level hue; confirm
# TARGET_CLASSIFICATION is binary.
sns.violinplot(
    data=df_small,
    x="prospect_feature1",
    y="prospect_feature2",
    hue="TARGET_CLASSIFICATION",
    inner="quart",
    split=True,
)
sns.despine(left=True)

In [None]:
# Second group of candidate features plotted against TARGET_RELATED_DATA,
# coloured by TARGET_CLASSIFICATION.
feature_group_2 = ['prospect_feature5', 'prospect_feature6',
                   'prospect_feature7', 'prospect_feature8']
ppp = sns.pairplot(
    data=df_small,
    x_vars=feature_group_2,
    y_vars=['TARGET_RELATED_DATA'],
    hue='TARGET_CLASSIFICATION',
)

In [None]:
# Third group of candidate features plotted against TARGET_RELATED_DATA,
# coloured by TARGET_CLASSIFICATION.
feature_group_3 = ['prospect_feature9', 'prospect_feature10',
                   'prospect_feature11', 'prospect_feature12']
pppp = sns.pairplot(
    data=df_small,
    x_vars=feature_group_3,
    y_vars=['TARGET_RELATED_DATA'],
    hue='TARGET_CLASSIFICATION',
)

In [None]:
# Specify your selected features from the prospect data set here
features_to_select = ['prospect_feature1', 'prospect_feature2', 'prospect_feature3', 'prospect_feature4',
                      'prospect_feature5', 'prospect_feature6', 'prospect_feature7', 'prospect_feature8',
                      'prospect_feature9', 'prospect_feature10', 'prospect_feature11', 'prospect_feature12']

# Separate the target from the input features (full frame, not the EDA sample)
X = df[features_to_select]
y = df['TARGET_CLASSIFICATION']

# Split 70/30 train/test; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Fit the gradient-boosted classifier on the training split
# NOTE(review): recent xgboost releases appear to require class labels encoded
# as 0..n_classes-1 -- confirm TARGET_CLASSIFICATION is already encoded that way.
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
gbm.fit(X_train, y_train)

# Predict classes for the held-out test split
predictions = gbm.predict(X_test)

# Compute AND display the accuracy; previously the value was assigned but
# never shown, so the cell produced no output.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2%}")