In [None]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-2.4.6'
spark_version = 'spark-2.4.7'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:10 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:14 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic/main Sources

In [None]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-10-03 19:47:39--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-10-03 19:47:41 (1.02 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the "CloudETL" session with the PostgreSQL JDBC driver jar
# on the driver's classpath.
spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [None]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://laurentvh-kickstarter.s3.amazonaws.com/ks-projects-201801.csv"
spark.sparkContext.addFile(url)

# Spark's CSV reader uses backslash as its default escape character, so a quoted
# field that contains doubled quotes ("") can be split at the wrong commas and
# shift values into neighbouring columns. Setting the escape character to '"'
# makes the reader follow the usual CSV quoting convention.
# NOTE(review): re-check kick_df.dtypes after re-running - numeric columns such as
# goal/pledged/backers should no longer be inferred as string; confirm.
kick_df = spark.read.csv(
    SparkFiles.get("ks-projects-201801.csv"),
    sep=",",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)

# Show DataFrame
kick_df.show()

+----------+--------------------+--------------+-------------+--------+----------+---------+-------------------+--------+----------+-------+-------+-----------+----------------+-------------+
|        ID|                name|      category|main_category|currency|  deadline|     goal|           launched| pledged|     state|backers|country|usd pledged|usd_pledged_real|usd_goal_real|
+----------+--------------------+--------------+-------------+--------+----------+---------+-------------------+--------+----------+-------+-------+-----------+----------------+-------------+
|1000002330|The Songs of Adel...|        Poetry|   Publishing|     GBP|2015-10-09|  1000.00|2015-08-11 12:12:28|    0.00|    failed|      0|     GB|       0.00|            0.00|      1533.95|
|1000003930|Greeting From Ear...|Narrative Film| Film & Video|     USD|2017-11-01| 30000.00|2017-09-02 04:43:57| 2421.00|    failed|     15|     US|     100.00|         2421.00|     30000.00|
|1000004038|      Where is Hank?|Narrati

In [None]:
# List each column's name together with the type Spark inferred for it
kick_df.dtypes

[('ID', 'int'),
 ('name', 'string'),
 ('category', 'string'),
 ('main_category', 'string'),
 ('currency', 'string'),
 ('deadline', 'string'),
 ('goal', 'string'),
 ('launched', 'string'),
 ('pledged', 'string'),
 ('state', 'string'),
 ('backers', 'string'),
 ('country', 'string'),
 ('usd pledged', 'string'),
 ('usd_pledged_real', 'string'),
 ('usd_goal_real', 'string')]

In [None]:
# Configure settings for RDS
import os
import warnings

mode = "append"

# Connection details are read from environment variables so that credentials are
# never stored in the notebook. RDS_JDBC_URL and RDS_USER fall back to the values
# this notebook has always used; RDS_PASSWORD has no default and must be set
# (e.g. os.environ["RDS_PASSWORD"] = getpass.getpass() in a scratch cell).
jdbc_url = os.environ.get(
    "RDS_JDBC_URL",
    "jdbc:postgresql://kickstarter.c90yn2pvfvlh.us-east-2.rds.amazonaws.com:5432/postgres",
)
config = {"user": os.environ.get("RDS_USER", "postgres"),
          "password": os.environ.get("RDS_PASSWORD", ""),
          "driver": "org.postgresql.Driver"}

if not config["password"]:
    # Warn early instead of failing later inside the JDBC write with an opaque auth error
    warnings.warn("RDS_PASSWORD is not set; the JDBC write to RDS will fail to authenticate.")

In [None]:
# Write the DataFrame to the 'kickstarter' table in RDS, using the save mode
# and connection properties configured above.
kick_df.write.jdbc(
    url=jdbc_url,
    table='kickstarter',
    mode=mode,
    properties=config,
)

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Change the PySpark DataFrame into a pandas DataFrame.
# The name kick_df is rebound by this cell, so the conversion is guarded:
# running the cell a second time would otherwise call Spark-only methods on a
# pandas DataFrame and raise an AttributeError.
if not isinstance(kick_df, pd.DataFrame):
    kick_df = kick_df.toPandas()

## Preprocessing for machine learning

We will inspect the data to see whether there are any categorical variables or NA values that we need to drop.
<pre>
# Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()
</pre>


In [None]:
# Names of the columns whose dtype is "object" - the categorical variable list
kick_df_cat = [column for column, dtype in kick_df.dtypes.items() if dtype == "object"]

### I. Inspect whether we need bucketing of variables in categorical columns

<pre>
df[df_cat].nunique()
</pre>

In [None]:
# Count the distinct values in each of the columns listed in kick_df_cat
kick_df[kick_df_cat].nunique()

name                375729
category              1441
main_category          319
currency               101
deadline              3207
goal                  9204
launched            377181
pledged              63122
state                  934
backers               4129
country                226
usd pledged          95383
usd_pledged_real    106051
usd_goal_real        50734
dtype: int64

### II. Encode the categorical variables
First, we will inspect each column for NA values and decide whether any of the values in each categorical column need to be bucketed together.
Bucket them if needed:
<pre>
# Print out each Category value counts of a categorical column
cate_counts = df.ColumnName.value_counts()

# Visualize the value counts
cate_counts.plot.density()

# Determine which values to replace 
replace = list(cate_counts[cate_counts < THRESHOLD].index)  # replace THRESHOLD with the chosen cutoff count

# Replace in DataFrame
for value in replace:
    df.ColumnName = df.ColumnName.replace(value, "Bucket")
# Check to make sure binning was successful
df.ColumnName.value_counts()
</pre>

Then, move on to encoding the categorical columns, or skip straight to this part if bucketing is unnecessary.
<pre>
# Create a OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df.ColumnName.values.reshape(-1,1)))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(['ColumnName'])
encode_df.head()

</pre>

Finally, merge the encoded DataFrame with the original df and drop the original columns.

<pre>
df.merge(encode_df, left_index=True, right_index=True).drop("ColumnName", 1)
</pre>

### III. Decide on columns to drop
Inspect the null values in each column and decide whether it is worth dropping them or filling the NAs with 0s if needed.
Then, we will drop columns that do not add valuable information, such as the "name" and "id" columns in the kick_df.

### IV. Trial and Error for Machine Learning models: Random Forests versus Neural Networks
Since we will be developing a model that can identify whether a project will succeed in getting funded, we will be using binary classification.

To get started we will define features and the output.
<pre>
# Split our preprocessed data into our features and target arrays
y = new_df["OutputColumn"].values
X = new_df.drop(["OutputColumn"],1).values

# For our dataset, the output column will be the "state" column.
# Split the data into testing and training dataset before standardizing the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
</pre>

### VI. Standardize the data

<pre>
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
</pre>

### VII. Random Forests
<pre>
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")
</pre>

### VIII. Support Vector Machine 
<pre>
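# Create the SVM model
from sklearn.svm import SVC
svm = SVC(kernel='sigmoid')
# Train the model on the scaled training data (the prediction below uses the scaled test data)
svm.fit(X_train_scaled, y_train)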
# Evaluate the model
y_pred= svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")
</pre>


### IX. Neural Networks
<pre>
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8 # number of neurons will change depending on the nature of the dataset (2-3 times the number of inputs)
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

# Evaluate the model using the scaled test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")