In [56]:
# import dependencies
# May have more of less than we need here. (to be updated later)
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
import numpy as np

In [2]:
# Loading data
# path may be different than mine
file_path = Path("Resources/renamed_tornado_data.csv")
tornado_df = pd.read_csv(file_path)
print(tornado_df.shape)
tornado_df.head()


(67558, 14)


Unnamed: 0,Year,Month,Day,Date,State,Magnitude,Injuries,Fatalities,Start Lat,Start Long,Ending Lat,Ending Long,Length_in_Miles,Width_in_Yards
0,1950,1,3,1950-01-03,IL,3,3,0,39.1,-89.3,39.12,-89.23,3.6,130
1,1950,1,3,1950-01-03,MO,3,3,0,38.77,-90.22,38.83,-90.03,9.5,150
2,1950,1,3,1950-01-03,OH,1,1,0,40.88,-84.58,0.0,0.0,0.1,10
3,1950,1,13,1950-01-13,AR,3,1,1,34.4,-94.37,0.0,0.0,0.6,17
4,1950,1,25,1950-01-25,IL,2,0,0,41.17,-87.33,0.0,0.0,0.1,100


In [3]:
# look for unique values
tornado_df.nunique()

Year                  72
Month                 12
Day                   31
Date               12300
State                 53
Magnitude              7
Injuries             209
Fatalities            50
Start Lat          14215
Start Long         16024
Ending Lat         15043
Ending Long        16571
Length_in_Miles     2429
Width_in_Yards       405
dtype: int64

In [4]:
# check unique vales per column and decide if we want to replace any
count=tornado_df.Length_in_Miles.value_counts()
count[:10]

0.10     15664
0.50      4950
1.00      4666
0.20      4542
2.00      2775
0.30      2477
3.00      1171
1.50      1138
5.00       753
4.00       703
0.80       604
2.50       533
6.00       464
0.01       463
0.40       450
8.00       406
0.70       382
7.00       379
10.00      373
Name: Length_in_Miles, dtype: int64

In [5]:
# how many rows and columns
print(tornado_df.shape)

(67558, 14)


In [6]:
# Check types
tornado_df.dtypes

Year                 int64
Month                int64
Day                  int64
Date                object
State               object
Magnitude            int64
Injuries             int64
Fatalities           int64
Start Lat          float64
Start Long         float64
Ending Lat         float64
Ending Long        float64
Length_in_Miles    float64
Width_in_Yards       int64
dtype: object

In [7]:
# tornado_df = tornado_df.astype({'Start Lat': 'int64', 'Start Long': 'int64','Ending Lat': 'int64',
#                                 'Ending Long': 'int64', 'Length_in_Miles': 'int64'})
# tornado_df.dtypes

In [8]:
# Look for null values
for column in tornado_df.columns:
    print(f"Column {column} has {tornado_df[column].isnull().sum()} null values")

Column Year has 0 null values
Column Month has 0 null values
Column Day has 0 null values
Column Date has 0 null values
Column State has 0 null values
Column Magnitude has 0 null values
Column Injuries has 0 null values
Column Fatalities has 0 null values
Column Start Lat has 0 null values
Column Start Long has 0 null values
Column Ending Lat has 0 null values
Column Ending Long has 0 null values
Column Length_in_Miles has 0 null values
Column Width_in_Yards has 0 null values


In [9]:
# look for duplicate rows
# Have fun playing with this. Tons and tons of duplicates! We will keep them all bc tornados happen everywhere all the time
print(f"Duplicate entries: {tornado_df[('Year')].duplicated().sum()}")

Duplicate entries: 67486


In [10]:
# drop columns if needed
# inplace = True means we are using this database instead of reassigning data base
tornado_df.drop(columns =["Year", "Month", "Day", "Date", "State", "Ending Lat", "Ending Long"], inplace=True)
tornado_df.head()

Unnamed: 0,Magnitude,Injuries,Fatalities,Start Lat,Start Long,Length_in_Miles,Width_in_Yards
0,3,3,0,39.1,-89.3,3.6,130
1,3,3,0,38.77,-90.22,9.5,150
2,1,1,0,40.88,-84.58,0.1,10
3,3,1,1,34.4,-94.37,0.6,17
4,2,0,0,41.17,-87.33,0.1,100


In [11]:
# Create target(y) and features(X) arrays
y = tornado_df["Injuries"].values
X = tornado_df.drop(["Injuries"],1).values

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Split into our training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=44)

In [13]:
# create scaler, fit, transform training and testing sets
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# create model
rfr_model = RandomForestRegressor(n_estimators=110, random_state=1)

In [15]:
# Fit model
rfr_model = rfr_model.fit(X_train_scaled, y_train)

In [16]:
# Evaluate model
pred = rfr_model.predict(X_test_scaled)
# print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")
print(pred)

[1.3        0.         0.05454545 ... 1.2        0.07272727 1.14545455]


In [34]:
# Define the model
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 48
hidden_nodes_layer2 = 36
hidden_nodes_layer3 = 24
hidden_nodes_layer4 = 12
hidden_nodes_layer5 = 10
hidden_nodes_layer6 = 8

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))

# Fifth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="tanh"))

# sixth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile model
nn.compile(loss="mean_absolute_error", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
528/528 - 1s - loss: 1.5023 - accuracy: 0.8721 - 528ms/epoch - 999us/step
Loss: 1.502312421798706, Accuracy: 0.8720544576644897


In [None]:
# Linear Regression Stuff below :)

In [53]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

LinearRegression()

In [50]:
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=6, random_state=44, noise=4, bias=100.0)

In [51]:
model.score(X,y)

-0.5695903206347475

In [61]:
training_data_prediction = model.predict(X_train_scaled)

In [62]:
error_score = metrics.r2_score(y_train, training_data_prediction)
print("R squared Error : ", error_score)

R squared Error :  0.5512472943680212
