Topics covered:
- LinearRegression
- PolynomialFeatures
- Ridge
- MinMaxScaler
- Lasso
- SVR
- GridSearchCV
- Regularization
- DecisionTreeRegressor

These exercises use the Communities and Crime (Unnormalized) dataset, available here:
https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized


Below are the columns of the dataset:

predictive_columns = ['population' 'householdsize' 'racepctblack' 'racePctWhite' 'racePctAsian'
 'racePctHisp' 'agePct12t21' 'agePct12t29' 'agePct16t24' 'agePct65up'
 'numbUrban' 'pctUrban' 'medIncome' 'pctWWage' 'pctWFarmSelf' 'pctWInvInc'
 'pctWSocSec' 'pctWPubAsst' 'pctWRetire' 'medFamInc' 'perCapInc'
 'whitePerCap' 'blackPerCap' 'indianPerCap' 'AsianPerCap' 'OtherPerCap'
 'HispPerCap' 'NumUnderPov' 'PctPopUnderPov' 'PctLess9thGrade'
 'PctNotHSGrad' 'PctBSorMore' 'PctUnemployed' 'PctEmploy' 'PctEmplManu'
 'PctEmplProfServ' 'PctOccupManu' 'PctOccupMgmtProf' 'MalePctDivorce'
 'MalePctNevMarr' 'FemalePctDiv' 'TotalPctDiv' 'PersPerFam' 'PctFam2Par'
 'PctKids2Par' 'PctYoungKids2Par' 'PctTeen2Par' 'PctWorkMomYoungKids'
 'PctWorkMom' 'NumKidsBornNeverMar' 'PctKidsBornNeverMar' 'NumImmig'
 'PctImmigRecent' 'PctImmigRec5' 'PctImmigRec8' 'PctImmigRec10'
 'PctRecentImmig' 'PctRecImmig5' 'PctRecImmig8' 'PctRecImmig10'
 'PctSpeakEnglOnly' 'PctNotSpeakEnglWell' 'PctLargHouseFam'
 'PctLargHouseOccup' 'PersPerOccupHous' 'PersPerOwnOccHous'
 'PersPerRentOccHous' 'PctPersOwnOccup' 'PctPersDenseHous' 'PctHousLess3BR'
 'MedNumBR' 'HousVacant' 'PctHousOccup' 'PctHousOwnOcc' 'PctVacantBoarded'
 'PctVacMore6Mos' 'MedYrHousBuilt' 'PctHousNoPhone' 'PctWOFullPlumb'
 'OwnOccLowQuart' 'OwnOccMedVal' 'OwnOccHiQuart' 'OwnOccQrange' 'RentLowQ'
 'RentMedian' 'RentHighQ' 'RentQrange' 'MedRent' 'MedRentPctHousInc'
 'MedOwnCostPctInc' 'MedOwnCostPctIncNoMtg' 'NumInShelters' 'NumStreet'
 'PctForeignBorn' 'PctBornSameState' 'PctSameHouse85' 'PctSameCity85'
 'PctSameState85' 'LemasSwornFT' 'LemasSwFTPerPop' 'LemasSwFTFieldOps'
 'LemasSwFTFieldPerPop' 'LemasTotalReq' 'LemasTotReqPerPop'
 'PolicReqPerOffic' 'PolicPerPop' 'RacialMatchCommPol' 'PctPolicWhite'
 'PctPolicBlack' 'PctPolicHisp' 'PctPolicAsian' 'PctPolicMinor'
 'OfficAssgnDrugUnits' 'NumKindsDrugsSeiz' 'PolicAveOTWorked' 'LandArea'
 'PopDens' 'PctUsePubTrans' 'PolicCars' 'PolicOperBudg'
 'LemasPctPolicOnPatr' 'LemasGangUnitDeploy' 'LemasPctOfficDrugUn'
 'PolicBudgPerPop']
 
 
 target_columns = ['murders' 'murdPerPop' 'rapes' 'rapesPerPop' 'robberies'
 'robbbPerPop' 'assaults' 'assaultPerPop' 'burglaries' 'burglPerPop'
 'larcenies' 'larcPerPop' 'autoTheft' 'autoTheftPerPop' 'arsons'
 'arsonsPerPop' 'ViolentCrimesPerPop' 'nonViolPerPop']

In [None]:
def get_crime_dataset(feature_columns=None, target_column='burglPerPop',
                      path='CommViolPredUnnormalizedData.txt'):
    """Load the Communities and Crime data and return ``(X_crime, y_crime)``.

    Source of the data:
    https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized

    Called without arguments this behaves as before: eight hand-picked
    predictive columns are returned as X and 'burglPerPop' as y.

    Args:
        feature_columns: column name, or iterable of column names, to use as
            predictors. ``None`` selects the default eight columns listed in
            the body. A single name (e.g. ``'perCapInc'``) yields a one-column
            DataFrame, which is already 2-D, so no reshape is needed.
            (Your exercise: pass your own list here.)
        target_column: name of the column to predict; any one of the target
            columns of the dataset.
        path: location of the comma-separated data file.

    Returns:
        tuple: ``(X_crime, y_crime)`` where ``X_crime`` is a DataFrame with
        the selected feature columns and ``y_crime`` is the target Series.

    Raises:
        KeyError: if a requested column is not present in the file.
    """
    import pandas as pd

    if feature_columns is None:
        feature_columns = ['PctPopUnderPov', 'racepctblack', 'racePctWhite',
                           'racePctAsian', 'racePctHisp', 'population',
                           'medIncome', 'PctKidsBornNeverMar']
    elif isinstance(feature_columns, str):
        # A bare string would otherwise be split into characters by list().
        feature_columns = [feature_columns]

    # '?' is read as a missing value.
    df = pd.read_csv(path, sep=',', na_values='?')

    # Drop the first five columns (city, state, etc.), then every row that
    # has a missing value.
    # NOTE: dropna() looks at ALL remaining columns, so a row is discarded
    # even when the gap is in a column that is not selected below.
    crime = df.drop(df.columns[[0, 1, 2, 3, 4]], axis=1).dropna()

    X_crime = crime[list(feature_columns)]
    y_crime = crime[target_column]

    return (X_crime, y_crime)

In [None]:
def printDataSet():
    """Print the first rows of the features, then of the target.

    Both come from ``get_crime_dataset()``; nothing is returned.
    """
    features, target = get_crime_dataset()
    for part in (features, target):
        print(part.head())


printDataSet()

In [None]:
def returnColumnNames():
    """Print the feature column names and the target column name.

    Despite its name this function only prints; it returns ``None``.
    The names are taken from the result of ``get_crime_dataset()``.
    """
    dataset = get_crime_dataset()
    feature_names = dataset[0].columns.values
    target_name = dataset[1].name
    print('X: ', feature_names, '\ny: ', target_name)


returnColumnNames()
    

In [None]:
# Let's get a baseline with a dummy regressor.

def ex0():
    """Fit a ``DummyRegressor`` baseline and return its R2 scores.

    The data from ``get_crime_dataset()`` is split with
    ``train_test_split(random_state=0)`` and a ``DummyRegressor`` with its
    default settings is fitted on the training part. The scores of the later
    exercises can be compared against this baseline.

    Returns:
        tuple: ``(r2_train, r2_test)``.
    """
    import warnings
    # Ignore warnings from scipy whose message starts with "internal gelsd".
    warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
    from sklearn.dummy import DummyRegressor
    from sklearn.model_selection import train_test_split

    (X_crime, y_crime) = get_crime_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                        random_state=0)
    dummy = DummyRegressor().fit(X_train, y_train)

    # dummy.score gives the R2 score
    return (dummy.score(X_train, y_train), dummy.score(X_test, y_test))


ex0()

In [None]:
# Perform linear regression.

def ex1():
    """Fit an ordinary ``LinearRegression`` and return its R2 scores.

    The data from ``get_crime_dataset()`` is split with
    ``train_test_split(random_state=0)``; the model is fitted on the training
    part and scored on both parts.

    Returns:
        tuple: ``(r2_train, r2_test)``.
    """
    import warnings
    # Ignore warnings from scipy whose message starts with "internal gelsd".
    warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    (X_crime, y_crime) = get_crime_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)

    # linreg.score gives the R2 score
    return (linreg.score(X_train, y_train), linreg.score(X_test, y_test))


ex1()

In [None]:
# Perform polynomial regression of degree 2
# (same steps as the prior exercise, on expanded features).
def ex1a():
    """Fit a linear regression on degree-2 polynomial features.

    The features from ``get_crime_dataset()`` are expanded with
    ``PolynomialFeatures(degree=2)``, split with
    ``train_test_split(random_state=0)``, and a ``LinearRegression`` is
    fitted on the training part.

    Returns:
        tuple: ``(r2_train, r2_test)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split

    (X_crime, y_crime) = get_crime_dataset()

    X_poly = PolynomialFeatures(degree=2).fit_transform(X_crime)

    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_crime,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)

    return (linreg.score(X_train, y_train), linreg.score(X_test, y_test))


ex1a()

In [None]:
# To avoid the overfitting problems of higher-degree polynomial regression,
# we put a penalty on coefficients that are large.
# Ridge regression is a type of regularized linear regression that uses an
# alpha parameter to control that penalty: the larger the coefficients
# (theta), the larger the penalty, and the larger alpha is, the more the
# coefficients are pushed towards small values.
def ex2():
    """Fit ``Ridge(alpha=20.0)`` on degree-2 polynomial features.

    The features from ``get_crime_dataset()`` are expanded with
    ``PolynomialFeatures(degree=2)`` and split with
    ``train_test_split(random_state=0)``. No feature scaling is applied here.

    Returns:
        tuple: ``(r2_train, r2_test)``.
    """
    from sklearn.linear_model import Ridge
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split

    (X_crime, y_crime) = get_crime_dataset()

    X_poly = PolynomialFeatures(degree=2).fit_transform(X_crime)

    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_crime,
                                                        random_state=0)
    linridge = Ridge(alpha=20.0).fit(X_train, y_train)

    return (linridge.score(X_train, y_train), linridge.score(X_test, y_test))


ex2()

In [None]:
# When features vary wildly in scale, e.g. when calculating the price of a
# house, the square footage is in the thousands and the number of bedrooms is
# in the single digits, it's best to normalize the data to values between
# 0 and 1 or -1 and 1. MinMaxScaler is suitable on most occasions - make sure
# there are no outliers.
def ex2a():
    """Fit ``Ridge(alpha=20.0)`` on min-max scaled degree-2 polynomial features.

    Same as ``ex2`` except that the polynomial features are scaled with
    ``MinMaxScaler`` before fitting.

    Returns:
        tuple: ``(r2_train, r2_test)`` computed on the scaled data.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import Ridge
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_poly = PolynomialFeatures(degree=2).fit_transform(X_crime)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_crime,
                                                        random_state=0)
    # Both the training set and the test set need to be scaled; the scaler is
    # fitted on the training set only and then applied to the test set.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

    return (linridge.score(X_train_scaled, y_train),
            linridge.score(X_test_scaled, y_test))


ex2a()

In [None]:
# Use the min-max scaler and a ridge regressor with alpha values in
# [0, 1, 10, 20, 50, 100, 1000] on polynomial features of degree 3,
# and find the best alpha.
def ex3():
    """Print train/test R2 of ridge regression for several alpha values.

    The features from ``get_crime_dataset()`` are expanded with
    ``PolynomialFeatures(degree=3)``, split with
    ``train_test_split(random_state=0)`` and min-max scaled. One ``Ridge``
    model is fitted per alpha and its scores are printed; picking the best
    alpha is left to the reader. Nothing is returned.
    """
    import warnings
    # Ignore every warning attributed to the scipy module.
    warnings.filterwarnings(action="ignore", module="scipy")
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Ridge

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_poly = PolynomialFeatures(degree=3).fit_transform(X_crime)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_crime,
                                                        random_state=0)

    # Both the training set and the test set need to be scaled; the scaler is
    # fitted on the training set only.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # NOTE: alpha = 0 means no penalty at all. The scikit-learn Ridge docs
    # advise against alpha = 0 for numerical reasons (LinearRegression is the
    # recommended tool for that case); it is kept because the exercise asks
    # for it.
    for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
        linridge = Ridge(alpha=this_alpha).fit(X_train_scaled, y_train)
        r2_train = linridge.score(X_train_scaled, y_train)
        r2_test = linridge.score(X_test_scaled, y_test)
        print('Alpha = {:.2f}\nr-squared training: {:.2f}, r-squared test: {:.2f}'
              .format(this_alpha, r2_train, r2_test))


ex3()

In [None]:
# Lasso Regression
# Another way of doing regularization is Lasso regression, which also
# penalizes the coefficients when doing the regression.
# When to use Lasso vs Ridge?
# Find which features have the most effect (use DecisionTreeRegressor,
# explained below):
# - many small/medium sized effects: use Ridge
# - only a few variables with medium/large effects: use Lasso
#
# Do a Lasso regression for alpha in [0.1, 0.5, 1, 2, 3, 5, 10, 20, 50, 100]
# with max_iter = 10000 and polynomial features of degree 4.
def ex4():
    """Print train/test R2 of Lasso regression for several alpha values.

    The features from ``get_crime_dataset()`` are expanded with
    ``PolynomialFeatures(degree=4)``, split with
    ``train_test_split(random_state=0)`` and min-max scaled. One ``Lasso``
    model (``max_iter=10000``) is fitted per alpha and its scores are
    printed. Nothing is returned.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_poly = PolynomialFeatures(degree=4).fit_transform(X_crime)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y_crime,
                                                        random_state=0)

    # Both the training set and the test set need to be scaled; the scaler is
    # fitted on the training set only.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The list matches the exercise statement above (0.1 was missing before).
    for alpha in [0.1, 0.5, 1, 2, 3, 5, 10, 20, 50, 100]:
        linlasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train_scaled, y_train)
        r2_train = linlasso.score(X_train_scaled, y_train)
        r2_test = linlasso.score(X_test_scaled, y_test)
        print('Alpha = {:.2f}\nr-squared training: {:.2f}, r-squared test: {:.2f}'
              .format(alpha, r2_train, r2_test))


ex4()

In [None]:
# Support Vector Machines: here used for regression (SVR) with a linear
# kernel on min-max scaled features.
# C = penalty (regularization) parameter
def ex5():
    """Fit a linear-kernel ``SVR(C=10)`` and print its train and test R2.

    The data from ``get_crime_dataset()`` is split with
    ``train_test_split(random_state=0)`` and min-max scaled before fitting.
    The two scores are printed on one line; nothing is returned.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVR

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                        random_state=0)

    # The scaler is fitted on the training set only, then applied to both sets.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = SVR(C=10, kernel='linear').fit(X_train_scaled, y_train)
    r2_train = clf.score(X_train_scaled, y_train)
    r2_test = clf.score(X_test_scaled, y_test)
    print(r2_train, r2_test)


ex5()

In [None]:
# Use GridSearchCV to run SVR with different parameters.
# cv is the number of cross-validation folds: the training data is split
# into cv parts and every parameter combination is evaluated on each of them.
def ex6():
    """Grid-search SVR hyper-parameters and print the best ones with R2 scores.

    The data from ``get_crime_dataset()`` is split with
    ``train_test_split(random_state=0)`` and min-max scaled. A
    ``GridSearchCV`` with ``cv=5`` over ``SVR(gamma=0.1)`` tries
    ``C`` in ``[1, 10, 100, 1000]`` and the kernels ``rbf``, ``linear`` and
    ``poly``. The best parameters are printed, followed by the train and
    test scores of the fitted search object. Nothing is returned.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVR

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                        random_state=0)

    # The scaler is fitted on the training set only, then applied to both sets.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svr = GridSearchCV(SVR(gamma=0.1), cv=5,
                       param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                                   'kernel': ['rbf', 'linear', 'poly']})

    clf = svr.fit(X_train_scaled, y_train)
    print(clf.best_params_)
    r2_train = clf.score(X_train_scaled, y_train)
    r2_test = clf.score(X_test_scaled, y_test)
    print(r2_train, r2_test)


ex6()

In [None]:
def ex7():
    """Fit a shallow decision tree and report which features matter most.

    The data from ``get_crime_dataset()`` is split with
    ``train_test_split(random_state=0)`` and min-max scaled. A
    ``DecisionTreeRegressor(max_depth=2)`` is fitted on the training part.
    Its ``feature_importances_`` are printed, followed by the feature and
    target column names so the importances can be matched to columns (they
    are in the same order as the feature columns).

    Returns:
        tuple: ``(r2_train, r2_test)`` computed on the scaled data.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.model_selection import train_test_split

    scaler = MinMaxScaler()
    (X_crime, y_crime) = get_crime_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                        random_state=0)
    # Both the training set and the test set need to be scaled; the scaler is
    # fitted on the training set only.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # random_state is fixed so that repeated runs build the same tree, in
    # line with the fixed random_state of the split above.
    tree = DecisionTreeRegressor(max_depth=2, random_state=0).fit(X_train_scaled, y_train)
    print(tree.feature_importances_)
    # Print the names from the data already loaded instead of reading and
    # parsing the data file a second time.
    print('X: ', X_crime.columns.values, '\ny: ', y_crime.name)
    return (tree.score(X_train_scaled, y_train), tree.score(X_test_scaled, y_test))


ex7()

In [None]:
# Exercise: try to predict 'ViolentCrimesPerPop' or 'nonViolPerPop'
# using one or more features.
# Try to get an R2 score above 0.60 without overfitting.
# Try to get an R2 score above 0.80 (with or without overfitting).
# Would you rather use the overfit model just because it gives you a better score?
# Which features are the most important?