In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

df = pd.read_csv('./sample1.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


In [2]:

%matplotlib inline
%load_ext autoreload
%autoreload 2


#Only use quant variables and drop any rows with missing values

num_vars = df.select_dtypes(include=['number']).copy()

#Drop the rows with missing salaries
drop_sal_df = num_vars.dropna(subset=['price'], axis=0)

# Mean function
fill_mean = lambda col: col.fillna(col.mean())
# Fill the mean
fill_df = drop_sal_df.apply(fill_mean, axis=0)

#Split into explanatory and response variables
X = fill_df.drop(columns=['price'])
y = fill_df['price']

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

lm_model = LinearRegression() # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for the model using only quantitative variables was 0.0335493569804699 on 6000 values.'

In [3]:
cat_df = df.select_dtypes(include=['object']).copy()
cat_df.head()
# Print how many categorical columns are in the dataframe - should be 147
cat_df.shape[1]

6

In [4]:
#Create a copy of the dataframe
cat_df_copy = cat_df.copy()
#Pull a list of the column names of the categorical variables
cat_cols_lst = cat_df.columns
print(cat_cols_lst)

Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
       'room_type', 'last_review'],
      dtype='object')


In [5]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
    for col in cat_cols:
        #print("working on column" + " " + col)
        df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_')], axis=1)
    df.head() 
    return df

In [6]:
# TODO: Drop rows where salary has missing values

df = df.dropna(subset=['price'], axis=0)

df_new = create_dummy_df(df, cat_cols_lst, dummy_na=False) #Use your newly created function


# Show shape to compare with initial shape
print(df_new.shape)
print(df.shape)

(20000, 28027)
(20000, 16)


In [None]:
from sklearn.preprocessing import StandardScaler

def clean_fit_linear_mod(df, response_col, cat_cols, dummy_na, test_size=.2, rand_state=31):
    '''
    INPUT:
    df - a dataframe holding all the variables of interest
    response_col - a string holding the name of the column 
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    test_size - a float between [0,1] about what proportion of data should be in the test dataset
    rand_state - an int that is provided as the random state for splitting the data into training and test 
    
    OUTPUT:
    test_score - float - r2 score on the test data
    train_score - float - r2 score on the test data
    lm_model - model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train test split used for optimal model
    
    '''
    #Drop the rows with missing response values
    df  = df.dropna(subset=[response_col], axis=0)
    

    #Drop columns with all NaN values
    df = df.dropna(how='all', axis=1)
   

    #Dummy categorical variables
    df = create_dummy_df(df, cat_cols, dummy_na)
 

    # Mean function
    fill_mean = lambda col: col.fillna(col.mean())
    # Fill the mean
    df = df.apply(fill_mean, axis=0)


    #Split into explanatory and response variables
    X = df.drop(response_col, axis=1)
    y = df[response_col]
    

    #Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rand_state)


    lm_model = LinearRegression() # Instantiate
    print("lm model instantiated")
    # Initialize the StandardScaler to normalize the features
    scaler = StandardScaler()
    print("scaler instantiated")
    
    # Fit and transform the explanatory variables (X)
    X_scaled = scaler.fit_transform(X_train)
    print(X_scaled)
    
    # Fit the model using the scaled features
    lm_model.fit(X_scaled, y_train)
    #lm_model.fit(X_train, y_train) #Fit


    #Predict using your model
    #y_test_preds = lm_model.predict(X_test)
    #y_train_preds = lm_model.predict(X_train)

    #Score using your model
    #test_score = r2_score(y_test, y_test_preds)
    #train_score = r2_score(y_train, y_train_preds)

    #return test_score, train_score, lm_model, X_train, X_test, y_train, y_test


#test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(df, 'price', cat_cols_lst, dummy_na=False)

xmen = clean_fit_linear_mod(df, 'price', cat_cols_lst, dummy_na=False)

lm model instantiated
scaler instantiated
[[-1.01907038 -0.76896584 -0.14642878 ... -0.11165119 -0.12182801
  -0.04682196]
 [-0.66965561 -0.6753397  -1.85734921 ... -0.11165119 -0.12182801
  -0.04682196]
 [ 0.32574764 -0.5565615   1.79329557 ... -0.11165119 -0.12182801
  -0.04682196]
 ...
 [ 1.36163872 -0.77247553 -0.78905027 ... -0.11165119 -0.12182801
  -0.04682196]
 [ 1.48146927  0.8655579   0.51772291 ... -0.11165119 -0.12182801
  -0.04682196]
 [ 1.09739099  2.09350205 -2.33666968 ... -0.11165119 -0.12182801
  -0.04682196]]
