# Loading data

## Multiple CSVs and JSONs

In [2]:
def file_loader():
    """Load every CSV and JSON file found in the current working directory.

    CSV files are read with pandas using the ``video_id`` column as index;
    JSON files are parsed with ``json.load``. File names are sorted because
    ``glob`` gives no ordering guarantee, so the order of the returned lists
    is reproducible. Any number of JSON files is accepted.

    Returns:
        tuple: ``(df_list, json_list)`` - one DataFrame per ``*.csv`` file
        and one parsed document per ``*.json`` file, both in sorted
        file-name order.
    """
    # Creating sorted lists of filenames
    csv_files = sorted(glob('*.csv'))
    json_files = sorted(glob('*.json'))

    # Loading files into variables
    df_list = [pd.read_csv(path, index_col='video_id') for path in csv_files]

    json_list = []
    for path in json_files:
        # The context manager closes each handle as soon as it is parsed.
        with open(path, 'r') as handle:
            json_list.append(json.load(handle))

    return df_list, json_list

## Loading .mat files

In [3]:
from scipy.io import loadmat
 
def mat_loader(mnist_path="./mnist-original.mat"):
    """Load a MATLAB ``.mat`` file and repackage it as a dataset dictionary.

    Args:
        mnist_path: Path of the ``.mat`` file. Defaults to the original
            hard-coded location ``./mnist-original.mat``.

    Returns:
        dict: with keys ``"data"`` (the transposed ``data`` variable of the
        file), ``"target"`` (the first row of its ``label`` variable),
        ``"COL_NAMES"`` and ``"DESCR"``.
    """
    mnist_raw = loadmat(mnist_path)
    mnist = {
        "data": mnist_raw["data"].T,
        "target": mnist_raw["label"][0],
        "COL_NAMES": ["label", "data"],
        "DESCR": "mldata.org dataset: mnist-original",
    }
    return mnist

# EDA and Visualization

## Null Checker for columns in DataFrame

In [4]:
def null_checker(data):
    """Print and return the number of null values in each column.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        dict: column name -> count of null entries in that column.
    """
    # isnull().sum() counts the nulls of every column directly; counting
    # the non-null values of some other column in the filtered rows does not.
    null_counts = {column: int(count)
                   for column, count in data.isnull().sum().items()}

    print(null_counts)
    return null_counts

## Computing ECDF

In [None]:
def ecdf(data):
    """Return the (x, y) points of the empirical CDF of 1-D measurements.

    x holds the measurements in ascending order; y holds the matching
    cumulative fractions 1/n, 2/n, ..., 1 where n is the number of points.
    """
    count = len(data)
    sorted_values = np.sort(data)
    fractions = np.arange(1, count + 1) / count
    return sorted_values, fractions

## ECDF with Percentiles 

In [None]:
# Plot the ECDF as unconnected points ('.' format string).
# NOTE(review): x_vers and y_vers are defined outside this cell - presumably
# the output of ecdf() on the petal-length measurements; confirm.
_ = plt.plot(x_vers, y_vers, '.')
plt.margins(0.02)
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Overlay percentiles as red diamonds.
# `percentiles` is divided by 100 before being used as the y-coordinate.
# NOTE(review): presumably `percentiles` is on a 0-100 scale and `ptiles_vers`
# holds the matching data values; both come from outside this cell - confirm.
_ = plt.plot(ptiles_vers, percentiles/100, marker='D', color='red',
         linestyle='none')

# Show the plot
plt.show()

## Performing Bernoulli Trials

In [None]:
def perform_bernoulli_trials(n, p):
    """Run n Bernoulli trials with success probability p; return the successes.

    Each trial takes one draw from np.random.random(); a draw strictly
    below p counts as a success.
    """
    return sum(1 for _ in range(n) if np.random.random() < p)


## Generating Bootstrap Samples and plotting them alongside original data

In [None]:
# Draw 50 bootstrap samples and overlay the ECDF of each one as faint gray
# points, then draw the ECDF of the original data on top.
# NOTE(review): `rainfall` is defined outside this cell - presumably a 1-D
# array of yearly rainfall measurements; confirm.
for i in range(50):
    # Generate bootstrap sample: bs_sample
    # (np.random.choice samples with replacement by default; the sample has
    # the same length as the data.)
    bs_sample = np.random.choice(rainfall, size=len(rainfall))

    # Compute and plot ECDF from bootstrap sample
    x, y = ecdf(bs_sample)
    _ = plt.plot(x, y, marker='.', linestyle='none',
                 color='gray', alpha=0.1)

# Compute and plot ECDF from original data
x, y = ecdf(rainfall)
_ = plt.plot(x, y, marker='.')

# Make margins and label axes
plt.margins(0.02)
_ = plt.xlabel('yearly rainfall (mm)')
_ = plt.ylabel('ECDF')

# Show the plot
plt.show()


## Generating Bootstrap replicates with specified function

In [None]:
def bootstrap_replicate_1d(data, func):
    """Return one bootstrap replicate of 1-D data.

    The data are resampled with replacement to their original length and
    ``func`` is applied to the resample.
    """
    return func(np.random.choice(data, size=len(data)))


def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates.

    Args:
        data: One-dimensional array of measurements.
        func: Statistic to compute on each bootstrap sample (e.g. np.mean).
        size: Number of replicates to draw.

    Returns:
        numpy.ndarray: float array of length ``size`` holding the replicates.
    """
    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    return bs_replicates


## Generating Pair Bootstrap Replicates

In [None]:
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression.

    Args:
        x: One-dimensional sequence of predictor values.
        y: One-dimensional sequence of responses, paired element-wise with x.
        size: Number of bootstrap replicates to draw.

    Returns:
        tuple: ``(bs_slope_reps, bs_intercept_reps)`` - two float arrays of
        length ``size`` with the slope and intercept of a degree-1
        ``np.polyfit`` on each resampled set of (x, y) pairs.
    """
    # Coerce to plain arrays so resampling is always positional: a list
    # cannot be indexed with an index array, and a pandas Series would be
    # indexed by label rather than by position.
    x = np.asarray(x)
    y = np.asarray(y)

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        # The same resampled indices pick both x and y, keeping pairs intact.
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, deg=1)

    return bs_slope_reps, bs_intercept_reps


## Performing Linear Regression on Bootstrap Replicates

In [None]:
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression.

    Args:
        x: One-dimensional sequence of predictor values.
        y: One-dimensional sequence of responses, paired element-wise with x.
        size: Number of bootstrap replicates to draw.

    Returns:
        tuple: ``(bs_slope_reps, bs_intercept_reps)`` - two float arrays of
        length ``size`` with the slope and intercept of a degree-1
        ``np.polyfit`` on each resampled set of (x, y) pairs.
    """
    # Coerce to plain arrays so resampling is always positional: a list
    # cannot be indexed with an index array, and a pandas Series would be
    # indexed by label rather than by position.
    x = np.asarray(x)
    y = np.asarray(y)

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        # The same resampled indices pick both x and y, keeping pairs intact.
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, deg=1)

    return bs_slope_reps, bs_intercept_reps


## Regression Plot using Bootstrap Replicates

In [None]:
# Generate array of x-values for bootstrap lines: x
# Two end points are enough, since every bootstrap fit is a straight line.
x = np.array([0,100])

# Plot the bootstrap lines
# NOTE(review): indexes replicates 0..99, so bs_slope_reps and
# bs_intercept_reps (defined outside this cell) must each hold at least 100
# entries - confirm against the cell that draws them.
for i in range(100):
    _ = plt.plot(x, bs_slope_reps[i]*x + bs_intercept_reps[i],
                 linewidth=0.5, alpha=0.2, color='red')

# Plot the data
# NOTE(review): `illiteracy` and `fertility` are defined outside this cell.
_ = plt.plot(illiteracy, fertility, linestyle='none', marker='.')

# Label axes, set the margins, and show the plot
_ = plt.xlabel('illiteracy')
_ = plt.ylabel('fertility')
plt.margins(0.02)
plt.show()

## Generating Permutation Samples

In [None]:
def permutation_sample(data1, data2):
    """Pool two data sets, shuffle the pool, and split it back in two.

    The first returned array has len(data1) elements; the second holds the
    remainder of the shuffled pool.
    """
    split_at = len(data1)
    shuffled = np.random.permutation(np.concatenate((data1, data2)))
    return shuffled[:split_at], shuffled[split_at:]

## Generating Permutation Replicates

In [None]:
def draw_perm_reps(data_1, data_2, func, size=1):
    """Compute `size` permutation replicates of a two-sample statistic.

    For each replicate a fresh permutation sample of the two data sets is
    drawn with permutation_sample() and `func` is applied to its two halves.
    Returns a float array of length `size`.
    """
    perm_replicates = np.empty(size)

    for idx in range(size):
        # Unpack the two permuted halves straight into the test statistic.
        perm_replicates[idx] = func(*permutation_sample(data_1, data_2))

    return perm_replicates

## Calculating p-value

In [None]:
def diff_of_means(data_1, data_2):
    """Return mean(data_1) minus mean(data_2)."""
    return np.mean(data_1) - np.mean(data_2)

# Compute difference of mean impact force from experiment: empirical_diff_means
# NOTE(review): force_a and force_b are defined outside this cell -
# presumably 1-D arrays of impact-force measurements; confirm.
empirical_diff_means = diff_of_means(force_a, force_b)

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(force_a, force_b,
                                 diff_of_means, size=10000)

# Compute p-value: p
# One-sided: the fraction of replicates that are at least as large as the
# observed difference.
p = np.sum(perm_replicates >= empirical_diff_means) / len(perm_replicates)

# Print the result
print('p-value =', p)


## Shifting data using new mean

In [None]:
# Shift every value of force_b by (new_mean - old_mean).
# NOTE(review): presumably old_mean is np.mean(force_b), so the shifted data
# have mean new_mean; all three names are defined outside this cell - confirm.
translated_force_b = force_b - old_mean + new_mean

## Pie chart

In [5]:
def pie_maker():
    """Show a pie chart comparing 'Python 3' and 'Python 2' entries.

    Reads the column at position 42 of the module-level DataFrame ``df`` and
    counts the entries equal to each of the two labels.
    """
    version_column = df.iloc[:, 42]
    labels = ['Python 3', 'Python 2']
    counts = [version_column[version_column == label].count()
              for label in labels]

    plt.pie(x=counts, labels=labels)
    plt.show()

## Seaborn Visualization

In [6]:
def visualizer(x, y, plot_type, title, xlabel, ylabel, rotation=False, rotation_value=60, figsize=(15,8)):
    """Draw a titled, labelled seaborn plot and show it.

    Args:
        x: Data for the x-axis (for "count", the values to be counted).
        y: Data for the y-axis; not used when plot_type is "count".
        plot_type: One of "bar", "count" or "reg".
        title: Figure title.
        xlabel: Label of the x-axis.
        ylabel: Label of the y-axis.
        rotation: When truthy, rotate the x tick labels.
        rotation_value: Rotation angle in degrees, used when `rotation` is set.
        figsize: Figure size passed to ``plt.figure``.

    Raises:
        ValueError: If plot_type is not one of the supported kinds.
    """
    # Fail before creating a figure instead of showing an empty one.
    if plot_type not in ("bar", "count", "reg"):
        raise ValueError(
            "plot_type must be 'bar', 'count' or 'reg', got {!r}".format(plot_type))

    plt.figure(figsize=figsize)

    if plot_type == "bar":
        sns.barplot(x=x, y=y)
    elif plot_type == "count":
        # Pass x by keyword: in current seaborn the first positional
        # parameter of countplot is `data`, not `x`.
        sns.countplot(x=x)
    else:
        sns.regplot(x=x, y=y)

    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)

    if rotation:
        plt.xticks(rotation=rotation_value)
    plt.show()

## 3D Visualization

In [None]:
from mpl_toolkits.mplot3d import Axes3D

def three_dplot():
    """Scatter the two module-level 3-D points ``a`` and ``b`` on 3-D axes.

    The z-axis is limited to the range [0, 9] before the figure is shown.
    """
    axes = plt.subplot(111, projection='3d')
    # Transpose the two points into per-axis coordinate tuples.
    xs, ys, zs = zip(*(a, b))
    axes.scatter(xs, ys, zs)
    axes.set_zlim3d([0, 9])
    plt.show()

def plot_vectors3d(ax, vectors3d, z0, **options):
    """Scatter 3-D vectors on `ax`, each with a dotted vertical drop line.

    For every vector a gray dotted segment is drawn from height z0 up to the
    vector's z value at the vector's (x, y) position; the vectors themselves
    are then drawn in one scatter call that receives **options.
    """
    for vx, vy, vz in vectors3d:
        ax.plot([vx, vx], [vy, vy], [z0, vz],
                color="gray", linestyle='dotted', marker=".")
    xs, ys, zs = zip(*vectors3d)
    ax.scatter(xs, ys, zs, **options)

"""
subplot3d = plt.subplot(111, projection='3d')
subplot3d.set_zlim([0, 9])
plot_vectors3d(subplot3d, [a,b], 0, color=("r","b"))
plt.show()
"""

# Preparing Data

## Stratified Sampling

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_sampler(data, column, n_splits=1, test_size=0.2, random_state=42):
    """Split a DataFrame into train and test sets stratified on one column.

    Args:
        data: pandas DataFrame to split.
        column: Name of the column whose distribution is used to stratify.
        n_splits: Number of re-shuffled splits generated by the splitter.
        test_size: Test-set size passed to ``StratifiedShuffleSplit``.
        random_state: Seed passed to ``StratifiedShuffleSplit``.

    Returns:
        tuple: ``(strat_train_set, strat_test_set)``. When n_splits > 1 only
        the last generated split is returned.
    """
    split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)

    for train_index, test_index in split.split(data, data[column]):
        # The splitter yields positional indices, so select rows with iloc;
        # loc would look them up as index labels.
        strat_test_set = data.iloc[test_index]
        strat_train_set = data.iloc[train_index]
    return (strat_train_set, strat_test_set)

## Stratified K-fold split

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Three stratified folds in data order. random_state is left out because it
# only has an effect together with shuffle=True, and current scikit-learn
# raises a ValueError when it is set while shuffle is False. Use
# StratifiedKFold(n_splits=3, shuffle=True, random_state=42) if shuffled
# folds are wanted.
skfolds = StratifiedKFold(n_splits=3)

def stratifed_k_fold():
    """Hand-rolled stratified cross-validation that prints per-fold accuracy.

    For every fold produced by the module-level ``skfolds`` splitter, a
    fresh clone of the module-level ``sgd`` estimator is fitted on the
    training part of ``X_train`` / ``y_train_5`` and the fraction of correct
    predictions on the held-out part is printed.
    """
    for train_idx, test_idx in skfolds.split(X_train, y_train_5):
        fold_clf = clone(sgd)
        fold_clf.fit(X_train[train_idx], y_train_5[train_idx])

        fold_pred = fold_clf.predict(X_train[test_idx])
        hits = sum(fold_pred == y_train_5[test_idx])
        print(hits / len(fold_pred))

## Dual Label Encoder for Preprocessing

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class dual_encoder(BaseEstimator, TransformerMixin):
    """Label-encode the ``cont_rating`` and ``prime_genre`` columns.

    The transformed frame has both source columns dropped and the integer
    codes appended as ``cont_encoded`` and ``genre_encoded``. The fitted
    encoders are kept on ``encoder_cont`` and ``encoder_prime_genre`` so
    that new data can be transformed with the same mapping. The input
    DataFrame is never modified.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """Learn the label mapping of both categorical columns.

        ``X`` is optional only so that the legacy zero-argument call
        ``fit()`` keeps working; nothing is learned in that case.
        """
        if X is None:
            return self

        # Imported here because no module-level import of LabelEncoder is
        # visible in this file.
        from sklearn.preprocessing import LabelEncoder

        self.encoder_cont = LabelEncoder().fit(X['cont_rating'])
        self.encoder_prime_genre = LabelEncoder().fit(X['prime_genre'])
        return self

    def transform(self, X=None):
        """Return a copy of X with both columns replaced by their codes.

        Requires a prior ``fit`` on data. Returns None for the legacy
        zero-argument call.
        """
        if X is None:
            return None

        # drop() returns a new frame, so the caller's DataFrame stays intact.
        encoded = X.drop(["cont_rating", "prime_genre"], axis=1)
        encoded["cont_encoded"] = self.encoder_cont.transform(X['cont_rating'])
        encoded["genre_encoded"] = self.encoder_prime_genre.transform(X['prime_genre'])
        return encoded

    def fit_transform(self, X, y=None):
        """Fit both encoders on X and return the encoded copy of X."""
        return self.fit(X, y).transform(X)
    

## DataFrame selector for Pipeline

In [10]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input."""

    def __init__(self, attribute_names):
        # Columns that transform() will select.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return X indexed by the stored attribute names."""
        selected = X[self.attribute_names]
        return selected

## Adding Polynomial feature for PolyRegression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
def poly():
    """Fit a degree-2 polynomial regression on the module-level X and y.

    X is expanded with ``PolynomialFeatures(degree=2, include_bias=False)``
    and a ``LinearRegression`` is fitted on the expanded features.

    Returns:
        tuple: ``(intercept_, coef_)`` of the fitted linear model.
    """
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly_features.fit_transform(X)

    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y)
    # A bare expression is only echoed by a notebook cell; inside a function
    # the values must be returned to be usable.
    return lin_reg.intercept_, lin_reg.coef_

## Pipelines and FeatureUnion

In [11]:
def pipeline_and_featureunion():
    """Build the numeric and categorical pipelines, join them, print the result.

    Works on the module-level ``store_data`` frame: every column except
    ``cont_rating`` and ``prime_genre`` goes through the numeric pipeline
    (column selection, ``dropper``, ``version_trimmer``); those two columns
    go through column selection followed by ``dual_encoder``. Both branches
    are combined with a FeatureUnion and the fitted, transformed data are
    printed.
    """
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import LabelEncoder

    category_attributes = ["cont_rating","prime_genre"]
    numerical_attributes = store_data.drop(category_attributes, axis=1).columns

    numeric_steps = [
        ("dataframe", DataFrameSelector(numerical_attributes)),
        ("dropper", dropper()),
        ("version-trimmer", version_trimmer()),
    ]
    numline = Pipeline(numeric_steps)

    categorical_steps = [
        ("dataframe", DataFrameSelector(category_attributes)),
        ("cat-encoder", dual_encoder()),
    ]
    catline = Pipeline(categorical_steps)

    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", numline),
        ("cat_pipeline", catline),
    ])

    print(full_pipeline.fit_transform(store_data))

# Algorithms and models

## Early stopping or beautiful free lunch

In [None]:
from sklearn.base import clone

def early_stopper():
    """Train an SGD regressor one epoch at a time and keep the best epoch.

    Uses the module-level ``X_train_poly_scaled`` / ``y_train`` for training
    and ``X_val_poly_scaled`` / ``y_val`` for validation. After each of the
    1000 epochs the validation MSE is computed; the epoch with the lowest
    error and a snapshot of the model at that point are remembered.

    Returns:
        tuple: ``(best_epoch, best_model)``.
    """
    from copy import deepcopy

    # max_iter=1 with warm_start=True makes every fit() call run exactly one
    # more epoch (max_iter replaces the removed n_iter parameter); tol=None
    # disables the built-in stopping criterion.
    sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty=None,
                           learning_rate="constant", eta0=0.0005)
    minimum_val_error = float("inf")
    best_epoch = None
    best_model = None

    for epoch in range(1000):
        sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off
        y_val_predict = sgd_reg.predict(X_val_poly_scaled)
        val_error = mean_squared_error(y_val, y_val_predict)
        if val_error < minimum_val_error:
            minimum_val_error = val_error
            best_epoch = epoch
            # deepcopy keeps the learned weights; sklearn's clone() would
            # return an unfitted estimator with the same hyper-parameters.
            best_model = deepcopy(sgd_reg)

    return best_epoch, best_model

# Testing Models

## Scoring ML models

In [12]:
from sklearn.model_selection import cross_val_score

#Scoring ML model(Using Negative root mean squared error) made easy
def model_scoring(model_name, model, X, y):
    """Print 10-fold cross-validated RMSE statistics for a model.

    The model is scored with ``neg_mean_squared_error``; the negated scores
    are square-rooted into per-fold RMSE values. The model name, a blank
    line, the per-fold RMSE array, its mean and its standard deviation are
    printed in that order.
    """
    # Cross validation, converted from negative MSE to RMSE
    rmse = np.sqrt(-cross_val_score(model, X, y,
                                    scoring="neg_mean_squared_error", cv=10))

    # Report
    report_lines = [
        model_name,
        "",
        "RMSE: {}".format(rmse),
        "MEAN: {}".format(rmse.mean()),
        "STD: {}".format(rmse.std()),
    ]
    for line in report_lines:
        print(line)

## Calculating training error and validation error

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE against training-set size.

    The data are split 80/20 into training and validation sets. The model is
    refitted on the first m training samples for every m from 1 up to and
    including the full training set, and the RMSE on those m samples and on
    the whole validation set is plotted.

    Args:
        model: Estimator with ``fit`` and ``predict``.
        X: Feature matrix.
        y: Target values.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []
    # + 1 so the last point of the curve uses the complete training set.
    for m in range(1, len(X_train) + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        # Arguments in the documented (y_true, y_pred) order.
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")

# After Analysis

## Pickling ML models along with metrics and train-test data

In [13]:
def pickler(name, model, predictions, training_data, training_label=None, cross_val_score=None, **kwargs):
    """Dump a model together with its data and metrics into ``<name>.pickle``.

    Args:
        name: Base name of the output file; ".pickle" is appended.
        model: The fitted model, stored under "ML Model".
        predictions: Stored as a NumPy array under "Predictions".
        training_data: Stored as a NumPy array under "Training Data".
        training_label: Stored as a NumPy array under "Training Labels".
        cross_val_score: Optional scores, stored under "Cross Val Score"
            when given.
        **kwargs: Extra entries added to the dictionary as-is; they are
            applied last and so override the keys above on a name clash.

    Returns:
        str: The name of the file that was written.
    """
    import numpy as np

    # sklearn.externals.joblib was removed from scikit-learn; prefer the
    # standalone package and fall back to the old location on old installs.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    # Making dictionary
    pickle_name = name + ".pickle"
    pickle_dict = {
        "ML Model": model,
        "Predictions": np.array(predictions),
        "Training Data": np.array(training_data),
        "Training Labels": np.array(training_label),
    }
    if cross_val_score is not None:
        pickle_dict["Cross Val Score"] = cross_val_score
    pickle_dict.update(kwargs)

    # Pickling the dictionary
    joblib.dump(pickle_dict, pickle_name)
    return pickle_name