<a href="https://colab.research.google.com/github/adamlutzz/DS-Unit-2-Regression-Classification/blob/master/Regression_and_Classification_Study_Guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression & Classification Sprint Challenge Study Guide

## Part 1, Classification

In [0]:
# Majority-class baseline: before fitting any classifier, measure how well
# "always guess the most common label" performs.
y_train = df_labels['y_labels']  # df_labels is a placeholder frame holding only the labels

# relative frequency of each class in the training labels
y_train.value_counts(normalize=True)

# the most frequent class; mode() can return ties, so take the first entry
majority_class = y_train.mode().iloc[0]

# baseline prediction: guess the majority class once per training label
y_pred = [majority_class for _ in range(len(y_train))]

### Accuracy Score

In [0]:
# For a majority-class baseline, accuracy equals the frequency of the majority class.
from sklearn.metrics import accuracy_score

# compare the baseline guesses (y_pred) against the true training labels (y_train)
accuracy_score(y_true=y_train, y_pred=y_pred)

### Train/Validate/Test Split

In [0]:
# 3-way holdout (train / validation / test).
# The validation set stands in for the test set while you iterate on a model:
# it is effectively a train/test split carved out of your own training data.

# Option 1 - split by time:
#   do it in pandas, the same way the rent dataset was split

# Option 2 - split randomly with scikit-learn:
from sklearn.model_selection import train_test_split

# features matrix and target vector ('y_target' is a generic placeholder label)
X_train = train_features
y_train = df_features['y_target']

# shapes before the split
X_train.shape, y_train.shape

# 80/20 split; stratify keeps the class proportions the same in both parts,
# random_state makes the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.80,
    test_size=0.20,
    stratify=y_train,
    random_state=11,
)

# shapes after the split
X_train.shape, X_val.shape, y_train.shape, y_val.shape

### Fit a Logistic Regression Model

In [0]:
# First pass: keep only the numeric columns (non-numeric features are dropped)
X_train_numeric = X_train.select_dtypes(include='number')

# the validation set must get exactly the same treatment
X_val_numeric = X_val.select_dtypes(include='number')

# Count missing values per column (rows/columns with nulls are typically dropped).
# Also look for sentinel values that really mean "missing" (0, ?, None, etc.) -
# check the dataset's documentation to find out which ones it uses.
X_train_numeric.isna().sum()

In [0]:
# Fit Logistic Regression on the training data (goal: beat the baseline)
from sklearn.linear_model import LogisticRegressionCV

# instantiate the model and bind it to `model`
# (the original used `-` instead of `=`, which raises NameError and never creates `model`)
# n_jobs=-1 uses all available processors to speed up the cross-validation
model = LogisticRegressionCV(n_jobs=-1)

# fit on the numeric training features
model.fit(X_train_numeric, y_train)

In [0]:
# Score the fitted model on data it has not seen: the validation set
from sklearn.metrics import accuracy_score

# predictions for every row of the numeric validation features
y_pred = model.predict(X_val_numeric)

# fraction of validation labels predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred)

In [0]:
# Shortcut for the cell above: predict on X and score against y in one call
model.score(X=X_val_numeric, y=y_val)

### One Hot Encoding

One-hot encoding adds a new dimension (column) for each unique value in a given categorical feature.

*   Works best with features that have low cardinality




In [0]:
# Explore the categorical features to pick good one-hot-encoding candidates.
# Only the last expression of a cell is rendered automatically, so the
# intermediate results are shown explicitly with display().

# cardinality of the non-numeric columns: transpose for readability, sort by 'unique'
# ('number' is the dtype selector pandas understands; 'numeric' is not a valid dtype)
display(X_train.describe(exclude='number').T.sort_values(by='unique'))

# choose a feature ('example' is a placeholder) with a desirable cardinality and explore it
display(X_train['example'].value_counts(normalize=True))

# temporarily combine X_train and y_train for exploration
train = X_train.copy()
train['y_label'] = y_train

# label distribution within each category
# (select the column with [...]; a GroupBy object is not callable)
train.groupby('example')['y_label'].value_counts(normalize=True)

In [0]:
# plot for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Add a 0/1 indicator for one label value so seaborn can plot its rate.
# The labels live in train['y_label'] (created when X_train and y_train were
# combined); 'y_label_0' stands for whichever label value you want to track.
train['y_label_0'] = (train['y_label'] == 'y_label_0').astype(int)
# train[['y_label', 'y_label_0']] # use this code to spot check

# bar height = mean of the indicator = share of rows with that label, per category
sns.catplot(x='example', y='y_label_0', data=train, kind='bar', color='grey')
plt.title('Percentage of subject with label 0 by example')
plt.show()

In [0]:
# use category encoders for one-hot encoding and StandardScaler for scaling
import category_encoders as ce
from sklearn.preprocessing import StandardScaler  # was a malformed double-`import` statement

# make list of features wanted
categorical_features = ['example']
# drop columns that are not actually part of the data ('not_part' is a placeholder)
numeric_features = X_train.select_dtypes('number').columns.drop('not_part').tolist()
features = categorical_features + numeric_features

# define the same subset of columns for train and validation
X_train_subset = X_train[features]
X_val_subset = X_val[features]

# create encoder
encoder = ce.OneHotEncoder(use_cat_names=True)

# learn the categories from train only, then apply them to validation
X_train_encoded = encoder.fit_transform(X_train_subset)
X_val_encoded = encoder.transform(X_val_subset)

# add scaler
scaler = StandardScaler()

# Fit the scaler on train only, then reuse those statistics for validation.
# Re-fitting on the validation set (fit_transform) would scale it with its own
# mean/std, so the two sets would no longer be on the same scale.
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)

# instantiate model
model = LogisticRegressionCV(n_jobs=-1)

# fit model
model.fit(X_train_scaled, y_train)

# output
print('Validation Accuracy', model.score(X_val_scaled, y_val))

## Part 2, Regression

### Baselines for Regression

In [0]:
# Regression baseline: the mean of the target column
# ('y_target' is a placeholder name for your target)
y_target_values = df['y_target']
y_target_values.mean()

### Train/Validate/Test Split

In [0]:
# by time
# pandas is imported here because this cell uses `pd` and nothing earlier in
# the notebook imports it
import pandas as pd

# rows created before the cutoff go to train, rows on/after it go to test
cutoff = pd.to_datetime('2016-06-01') # change this date to your cutoff date
train = df[df.created < cutoff]
test = df[df.created >= cutoff]

In [0]:
# split randomly from sklearn library
from sklearn.model_selection import train_test_split

# create X_train
X_train = train_features
y_train = df_features['y_target'] # I used 'y_target' as a generic label

# look at shape
X_train.shape, y_train.shape

# train/validation split function
# No `stratify` here: stratification groups rows by class label, and a
# continuous regression target has (nearly) unique values, so train_test_split
# raises ValueError. Stratify only for classification targets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size = 0.80, test_size = 0.20,
    random_state=11
)

# look at new shapes
X_train.shape, X_val.shape, y_train.shape, y_val.shape

### Fit Linear Regression Model

In [0]:
# scikit-learn's ordinary least squares estimator
from sklearn.linear_model import LinearRegression

# choose the feature columns and the target column (placeholder names)
features = ['feature_0', 'feature_1', 'feature_3']
target = 'target'

# X features matrix and y target vector
X_train = df_train[features]
y_train = df_train[target]

# create the estimator, then fit it on the training data
model = LinearRegression()
model.fit(X_train, y_train)

# predictions for the same rows the model was trained on
y_pred = model.predict(X_train)

### Plot Actual vs Predicted

In [0]:
# import plotting packages
import matplotlib.pyplot as plt

# Actual vs predicted: one point per training row.
# Plotting X_train against y_train only works for a single feature; with
# several feature columns matplotlib rejects it ("x and y must be the same
# size"). Comparing y_train with y_pred works for any number of features.
plt.scatter(y_train, y_pred, alpha=0.5)

# reference line y = x: a perfect prediction would fall exactly on it
low = min(y_train.min(), y_pred.min())
high = max(y_train.max(), y_pred.max())
plt.plot([low, high], [low, high], color='red')

plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()

### Calculate Coefficients and Intercept


In [0]:
# Coefficients: one per feature, paired with the column it belongs to.
# Assumes `model` was fit on the current `X_train` with a 1-D target, so
# coef_ has one entry per column - TODO confirm if you refit on other data.
# display() is needed because only a cell's last expression is rendered.
display(dict(zip(X_train.columns, model.coef_)))

# intercept (last expression, so it is rendered automatically)
model.intercept_