In [24]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [25]:
# Read 'retail_socio.csv' and discard the geometry, industry, date and geoid
# columns in the same expression, so df is bound exactly once in this cell.
df = pd.read_csv('retail_socio.csv').drop(
    columns=['the_geom', 'industry', 'do_date_x', 'do_date_y', 'geoid_x', 'geoid_y']
)
df.head()

Unnamed: 0.1,Unnamed: 0,region_id,txn_amt,in_grades_5_to_8,families_with_young_children,four_more_cars,pop_determined_poverty_status,sales_office_employed,hispanic_any_race,owner_occupied_housing_units_upper_value_quartile,...,asian_pop,amerindian_pop,dwellings_50_or_more_units,male_5_to_9,male_80_to_84,female_10_to_14,female_21,asian_male_55_64,commuters_by_car_truck_van,do_area
0,2,36,0.0,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
1,5,36,152.38,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
2,7,36,124.44,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
3,18,36,50.18,,,,,,,,...,,,,,,,,,,
4,19,36,46.06,,,,,,,,...,,,,,,,,,,


In [26]:
# Keep only the rows that have a value in every remaining column.
df = df.dropna()

## Preprocessing steps
1. Define the feature set
2. Define the target set
3. Split into training and testing sets
4. Create a StandardScaler instance
5. Fit the StandardScaler on the training data
6. Scale the training and testing data

In [27]:
# Feature matrix: every column of df except the target, "txn_amt".
# drop() returns a new frame, so df itself is left unchanged.
# NOTE(review): "Unnamed: 0" and "Unnamed: 0.1" look like row indexes written
# by an earlier CSV export and are kept as features here -- TODO confirm that
# this is intended.
X = df.drop(columns="txn_amt")
X.head()

Unnamed: 0.1,Unnamed: 0,region_id,in_grades_5_to_8,families_with_young_children,four_more_cars,pop_determined_poverty_status,sales_office_employed,hispanic_any_race,owner_occupied_housing_units_upper_value_quartile,employed_science_management_admin_waste,...,asian_pop,amerindian_pop,dwellings_50_or_more_units,male_5_to_9,male_80_to_84,female_10_to_14,female_21,asian_male_55_64,commuters_by_car_truck_van,do_area
0,2,36,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,23582.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
1,5,36,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,23582.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
2,7,36,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,23582.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
9,30,36,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,23582.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0
10,35,36,1980.0,5732.0,42.0,141601.0,18776.0,12094.0,1118800.0,23582.0,...,22871.0,200.0,71996.0,1562.0,1242.0,1034.0,1604.0,762.0,6174.0,4194152.0


In [28]:
# Define the target set.
# to_numpy() returns the "txn_amt" column as a 1-D NumPy array, the same data
# the `values` attribute exposes. It replaces Series.ravel(), which is
# deprecated since pandas 2.2 (a Series is already one-dimensional) and
# emits a FutureWarning there.
y = df["txn_amt"].to_numpy()
y[:5]

array([  0.  , 152.38, 124.44,   0.  ,   0.  ])

In [29]:
# Split features and target into training and testing subsets.
# No test_size/train_size is given, so scikit-learn's default proportion is
# used (25% held out for testing per the train_test_split documentation);
# random_state=78 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [30]:
# Create the StandardScaler instance; it is still unfitted at this point.
# With the default arguments used here it standardizes each feature by
# removing the mean and scaling to unit variance once it has been fitted.
scaler = StandardScaler()

In [31]:
# Fit the scaler on the training features only, so the scaling statistics
# are not influenced by the test rows.
# fit() returns the estimator itself, so X_scaler and scaler refer to the
# same fitted object.
X_scaler = scaler.fit(X_train)

In [32]:
# Scale both feature sets with the statistics learned from X_train.
# NOTE(review): random forests split on per-feature thresholds, so
# standardization is not expected to change the fitted model -- TODO confirm
# whether this step is needed for the estimator used below.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Create a random forest regressor (the target, txn_amt, is a numeric amount,
# so this is a regression problem rather than classification).
# n_estimators sets the number of trees built by the algorithm.
# A common guideline is 64 to 128 trees; higher counts such as the 500 used
# here are also common, at the cost of longer training time.
# random_state=78 makes the fitted forest reproducible.
rf_model = RandomForestRegressor(n_estimators=500, random_state=78) 

In [34]:
# Fit the forest on the scaled training features and the training targets.
# fit() returns the estimator itself, so rf_model still names the same
# (now fitted) model after the assignment.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [35]:
# Predict txn_amt for the scaled test features.
# NOTE(review): `predictions` is not referenced by the later cells visible
# here (scoring calls rf_model.score directly) -- TODO confirm it is used
# elsewhere or feed it into an explicit regression metric.
predictions = rf_model.predict(X_test_scaled)

In [45]:
# Score the model on the training data.
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy.
score_train = rf_model.score(X_train_scaled, y_train)
score_train

0.9049745514495331

In [46]:
# R^2 on the held-out test data. Compare it with score_train: a much lower
# test value indicates the model is overfitting the training set.
score_test = rf_model.score(X_test_scaled, y_test)
score_test

0.30684891553075744