## Categorical variables

In [126]:

# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline


In [60]:
# Load the raw survey results into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data/survey_results_public.csv')


In [66]:
# Keep only the quantitative columns. No rows are dropped in this cell;
# missing values are still present, as the preview below shows.
num_vars = df[[
    'Salary',
    'CareerSatisfaction',
    'HoursPerWeek',
    'JobSatisfaction',
    'StackOverflowSatisfaction',
]]
num_vars.head()

Unnamed: 0,Salary,CareerSatisfaction,HoursPerWeek,JobSatisfaction,StackOverflowSatisfaction
0,,,0.0,,9.0
1,,,,,8.0
2,113750.0,8.0,,9.0,8.0
3,,6.0,5.0,3.0,10.0
4,,6.0,,8.0,


In [67]:
# Rows without a Salary value are removed rather than imputed
drop_sal_df = num_vars.dropna(subset=['Salary'])


def fill_mean(col):
    """Return `col` with its missing values replaced by the column mean."""
    return col.fillna(col.mean())


# Impute every column with its own mean (column-wise apply)
fill_df = drop_sal_df.apply(fill_mean)

In [75]:
# Compare the row counts before and after removing rows without a Salary
print("Number of values before treating null values : {}".format(len(num_vars)))
print("Number of values after treating null values : {}".format(len(fill_df)))

Number of values before treating null values : 19102
Number of values after treating null values : 5009


In [76]:
# Split into the target (y) and the feature matrix (X)
y = fill_df['Salary']
X = fill_df[[
    'CareerSatisfaction',
    'HoursPerWeek',
    'JobSatisfaction',
    'StackOverflowSatisfaction',
]]

In [77]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Fit a linear regression model on the training split
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the installed version before re-running this cell.
lm_model = LinearRegression(normalize=True)
lm_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [82]:
# Predict on the held-out rows and report the r-squared on the test set
y_test_preds = lm_model.predict(X_test)
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(
    r2_score(y_test, y_test_preds),
    len(y_test),
)

'The r-squared score for the model using only quantitative variables was 0.02860568830900767 on 1002 values.'

Question 1: Use the df dataframe. Identify the columns that are categorical in nature. How many of the columns are considered categorical? 

In [93]:
# Columns stored with the `object` dtype are treated as categorical
cat_df = df.select_dtypes(include=['object'])
len(cat_df.columns)

147

Question 2: Use cat_df and the cells below to fill in the dictionary with correct values.

In [94]:
# Number of categorical columns with no missing values.
# The per-column reduction is spelled out with axis=0 instead of calling
# np.sum on a DataFrame, whose implicit axis=None handling is deprecated in pandas.
(cat_df.isnull().sum(axis=0) / cat_df.shape[0] == 0).sum()

6

In [95]:
# Number of categorical columns with more than half of their values missing.
# The per-column reduction is spelled out with axis=0 instead of calling
# np.sum on a DataFrame, whose implicit axis=None handling is deprecated in pandas.
(cat_df.isnull().sum(axis=0) / cat_df.shape[0] > 0.5).sum()

49

In [96]:
# Number of categorical columns with more than 75% of their values missing.
# The per-column reduction is spelled out with axis=0 instead of calling
# np.sum on a DataFrame, whose implicit axis=None handling is deprecated in pandas.
(cat_df.isnull().sum(axis=0) / cat_df.shape[0] > 0.75).sum()

13

In [97]:
# Each value is the integer count that answers the question in its key
# (taken from the outputs of the three cells above)
cat_df_dict = {
    'the number of columns with no missing values': 6,
    'the number of columns with more than half of the column missing': 49,
    'the number of columns with more than 75% of the column missing': 13,
}

Question 3: After you have created dummy_var_df, use the additional cells to fill in the sol_3_dict with the correct variables that match each key.

In [102]:
# Toy frame: col1 holds string levels, col2 holds numbers; both contain NaNs
dummy_var_df = pd.DataFrame(
    {
        'col1': ['a', 'a', 'b', 'b', 'a', np.nan, 'b', np.nan],
        'col2': [1, np.nan, 3, np.nan, 5, 6, 7, 8],
    }
)

dummy_var_df

Unnamed: 0,col1,col2
0,a,1.0
1,a,
2,b,3.0
3,b,
4,a,5.0
5,,6.0
6,b,7.0
7,,8.0


In [103]:
# Default encoding: one indicator column per level of col1; rows where col1 is NaN
# get 0 in every indicator column
pd.get_dummies(data=dummy_var_df['col1'])

Unnamed: 0,a,b
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0
5,0,0
6,0,1
7,0,0


In [104]:
# Candidate answers for Question 3
b, c = 2, 3
d, e = 'col1', 'col2'
f = 'the rows with NaNs are dropped by default'
g = 'the NaNs are always encoded as 0'

# Map each question to the chosen answer
sol_3_dict = {
    'Which column should you create a dummy variable for?': d,
    'When you use the default settings for creating dummy variables, how many are created?': b,
    'What happens with the nan values?': g,
}


Question 4: Create a new encoding for col1 of dummy_var_df that provides dummy columns not only for each level, but also for the missing values below. Store the resulting columns in dummy_cols_df. 

In [107]:
# dummy_na=True adds an extra indicator column that flags the missing values of col1
dummy_cols = pd.get_dummies(data=dummy_var_df['col1'], dummy_na=True)
dummy_cols

Unnamed: 0,a,b,NaN
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,1,0,0
5,0,0,1
6,0,1,0
7,0,0,1


In [108]:
# Question 5: 

In [121]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    Replace categorical columns of a dataframe with dummy (indicator) columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list-like of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
               (the first level of each column is dropped, drop_first=True)
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 

    NOTES:
    - Names in cat_cols that are not columns of df (or that are repeated) are skipped,
      so the function can be called on a dataframe that has already been encoded.
    - If none of the names in cat_cols is a column of df, df itself is returned unchanged.
    - The input dataframe is never modified.
    - Errors raised while encoding a column that is present are no longer swallowed.
    '''
    # Keep the requested order, ignoring absent or repeated names explicitly
    # instead of hiding every possible error behind a bare `except`.
    present_cols = []
    for col in cat_cols:
        if col in df.columns and col not in present_cols:
            present_cols.append(col)

    if not present_cols:
        return df

    # Encode each categorical column on its own so the column name becomes the prefix
    dummy_frames = [
        pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)
        for col in present_cols
    ]

    # A single concat avoids re-copying the growing frame once per categorical column
    return pd.concat([df.drop(present_cols, axis=1)] + dummy_frames, axis=1)

In [122]:
# Keep only the rows that have a Salary value.
# NOTE: this rebinds `df`, so the unfiltered survey frame is not available after this cell.
df = df.dropna(subset=['Salary'])

# Names of the categorical (object-dtype) columns
cat_df = df.select_dtypes(include=['object'])
cat_cols_lst = cat_df.columns

# Dummy-encode every categorical column, without NaN indicator columns
df_new = create_dummy_df(df, cat_cols=cat_cols_lst, dummy_na=False)

# Print the shape of df_new as a sanity check
print(df_new.shape)

(5009, 11938)


Question 6: Use the docstring below to complete the function.

In [123]:
def clean_fit_linear_mod(df, response_col, cat_cols, dummy_na, test_size=.3, rand_state=42):
    '''
    Clean a dataframe, fit a linear regression on it and report r2 scores.

    INPUT:
    df - a dataframe holding all the variables of interest
    response_col - a string holding the name of the response column
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    test_size - a float between [0,1] about what proportion of data should be in the test dataset
    rand_state - an int that is provided as the random state for splitting the data into training and test 
    
    OUTPUT:
    test_score - float - r2 score on the test data
    train_score - float - r2 score on the training data
    lm_model - the fitted LinearRegression model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train test split

    STEPS:
    1. Drop the rows with missing response values
    2. Drop columns with NaN for all the values
    3. Use create_dummy_df to dummy categorical columns
    4. Fill the mean of the column for any missing values
       (the means are computed on all remaining rows, before the train/test split)
    5. Split the data into an X matrix and a response vector y
    6. Create training and test sets of data
    7. Instantiate a LinearRegression model with normalize=True
    8. Fit the model to the training data
    9. Predict the response for the training data and the test data
    10. Obtain an rsquared value for both the training and test data
    '''
    def _impute_mean(col):
        # Replace the missing entries of one column with that column's mean
        return col.fillna(col.mean())

    # Steps 1-2: rows without a response, then columns that are entirely NaN
    cleaned = df.dropna(subset=[response_col], axis=0).dropna(how='all', axis=1)

    # Step 3: dummy-encode the categorical columns
    encoded = create_dummy_df(cleaned, cat_cols, dummy_na)

    # Step 4: column-wise mean imputation
    filled = encoded.apply(_impute_mean, axis=0)

    # Step 5: response vector and explanatory matrix
    y = filled[response_col]
    X = filled.drop(response_col, axis=1)

    # Step 6: train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=rand_state
    )

    # Steps 7-8: instantiate and fit
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 -
    # confirm the installed version before re-running.
    lm_model = LinearRegression(normalize=True)
    lm_model.fit(X_train, y_train)

    # Steps 9-10: predict on both splits and score them
    train_score = r2_score(y_train, lm_model.predict(X_train))
    test_score = r2_score(y_test, lm_model.predict(X_test))

    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test


# Test the function on the dummy-encoded survey data.
# NOTE(review): df_new has already been passed through create_dummy_df, so the names in
# cat_cols_lst should no longer be columns of df_new and the encoding step inside the
# function is expected to find nothing left to encode - confirm this is intended.
(test_score, train_score, lm_model,
 X_train, X_test, y_train, y_test) = clean_fit_linear_mod(
    df_new, 'Salary', cat_cols_lst, dummy_na=False
)

In [124]:
# Report the r-squared on both splits; a large gap between the training and the
# test score is a sign of overfitting
print(
    "The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(
        train_score, test_score
    )
)

The rsquared on the training data was 1.0.  The rsquared on the test data was 0.45302723304174586.
