In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading & Preprocessing 

In [2]:
# Read the weekly Google Search Trends export for Carvana into a DataFrame
trends_csv_path = Path('Carvana_Trends.csv')
carvana_search_df = pd.read_csv(trends_csv_path)

# Preview the first rows to confirm the file loaded as expected
carvana_search_df.head()

Unnamed: 0,Week,Carvana Co,Carvana stock down,Carvana bankruptcy,Carvana short,Carvana loss,Trending
0,7/26/2020,41,0,4,0,0,0
1,8/2/2020,43,0,0,24,24,0
2,8/9/2020,43,0,12,0,84,1
3,8/16/2020,40,0,0,0,0,0
4,8/23/2020,37,0,11,8,38,0


## Define the Features 

In [3]:
# Build the features set from every column except the target ('Trending')
# and the date label ('Week'). `drop` returns a new DataFrame, so
# carvana_search_df itself is left untouched and re-running this cell is safe.
X = carvana_search_df.drop(columns=['Trending', 'Week'])

# Review the features set derived from carvana_search_df
X.head()

Unnamed: 0,Carvana Co,Carvana stock down,Carvana bankruptcy,Carvana short,Carvana loss
0,41,0,4,0,0
1,43,0,0,24,24
2,43,0,12,0,84
3,40,0,0,0,0
4,37,0,11,8,38


## Define the Targets

In [4]:
# Pull the 'Trending' labels out of carvana_search_df as a NumPy array ...
trending_labels = carvana_search_df['Trending'].values
# ... and reshape them into a single-column (n_samples, 1) target array
y = trending_labels.reshape(-1, 1)

# Peek at the first five targets
y[:5]

array([[0],
       [0],
       [1],
       [0],
       [0]], dtype=int64)

## Split the Features and Targets into Training and Testing Sets

In [5]:
# Partition the features and targets into training and testing subsets;
# the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=70,
)

## Use the StandardScaler to Scale the Features Data

In [6]:
# Create the StandardScaler instance (not yet fitted to any data)
scaler = StandardScaler()

In [7]:
# Fit the Standard Scaler with the Training Data only, so no statistics
# from the testing data are used when scaling
X_scaler = scaler.fit(X_train)

In [8]:
# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Fitting the Random Forest Model

Once the data is scaled, create a random forest instance and train it with the training data (X_train_scaled and y_train).

In [15]:
# Configure the random forest classifier: 500 trees, with a fixed seed so
# the fitted model is reproducible
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=70,
)

In [17]:
# Fit the model on the scaled training features.
# y_train is a column vector of shape (n_samples, 1); scikit-learn classifiers
# expect a 1-D target and emit a DataConversionWarning otherwise, so flatten it.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

  


## Making Predictions Using the Random Forest Model

Validate the trained model by predicting the `Trending` label using the testing data (X_test_scaled).

In [18]:
# Making predictions using the testing data (scaled with the scaler fitted
# on the training data)
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

Evaluate the model's results by using sklearn to calculate the confusion matrix and the accuracy score, and to generate the classification report.

In [19]:
# Calculating the confusion matrix.
# labels=[0, 1] pins the row/column order (and the 2x2 shape) so it always
# lines up with the hard-coded "Actual"/"Predicted" labels below, even if a
# class happens to be missing from y_test or from the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [20]:
# Show the confusion matrix as a rich table first
print("Confusion Matrix")
display(cm_df)

# Then print the accuracy and the per-class metrics, one item per line
summary_lines = [
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
]
print("\n".join(summary_lines))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24,1
Actual 1,3,11


Accuracy Score : 0.8974358974358975
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.96      0.92        25
           1       0.92      0.79      0.85        14

    accuracy                           0.90        39
   macro avg       0.90      0.87      0.88        39
weighted avg       0.90      0.90      0.90        39



## Feature Importance

In this section, fetch the feature importances from the random forest model and display the most important features (up to the top 10), ranked from most to least important.

In [21]:
# Get the feature importance array from the fitted random forest; it is
# paired with X.columns in the next cell to name each score
importances = rf_model.feature_importances_

In [22]:
# List the top 10 most important features (or all of them, if fewer than 10),
# pairing each importance score with its feature name and sorting from most
# to least important. Reuses the `importances` array fetched above.
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:10]

[(0.6185073171772, 'Carvana loss'),
 (0.13980904536318062, 'Carvana short'),
 (0.10579284746738823, 'Carvana Co'),
 (0.06875805585407459, 'Carvana stock down'),
 (0.06713273413815661, 'Carvana bankruptcy')]