In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Feature columns to keep from the joined tweet data, and the name of the label column.
columns = [
    'date',
    'text',
    'weight',
    'negative',
    'neutral',
    'positive',
]
target = ['sentiment']

In [4]:
# we will need to use pyspark to connect to our joined_v001.csv file in AWS S3
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [0% [1 InRelease gpgv 242 kB] [Waiting for headers] [Connecting to security.ubun                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
0% [1 InRelease gpgv 242 kB] [2 InRelease 14.2 kB/88.7 kB 16%] [Connecting to s                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu bionic-b

In [5]:
# Download a Postgres driver that will allow Spark to interact with Postgres.
# This fetches the PostgreSQL JDBC jar (version 42.2.16) into the current working
# directory; the SparkSession created later in this notebook references it as
# /content/postgresql-42.2.16.jar on the driver classpath.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-08-24 04:47:29--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-08-24 04:47:29 (9.76 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [6]:
# Create (or reuse) the Spark session with the Postgres JDBC jar on the driver classpath
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("TwitterEnsemble")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [7]:
# Pull joined_v001.csv from the S3 bucket and load it as a Spark DataFrame
from pyspark import SparkFiles

url = "https://jessehernandez70test1.s3.us-west-1.amazonaws.com/joined_v001.csv"
spark.sparkContext.addFile(url)

# Read the file registered via addFile; the first row is the header and
# column types are inferred from the data.
joined_data_df = spark.read.csv(
    SparkFiles.get("joined_v001.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)

# Preview the DataFrame
joined_data_df.show()

+----------+--------------------+-----------+-----+------+
|      date|                text|   negative|event|weight|
+----------+--------------------+-----------+-----+------+
|2022-08-01|the conflict is b...|0.602711678| null|  null|
|2022-08-01|remember when sma...|0.212250158| null|  null|
|2022-08-01|hi  you have to u...|0.490129501| null|  null|
|2022-08-01|the world is in d...|0.876022816| null|  null|
|2022-08-01|will the imposed ...|0.044509333| null|  null|
|2022-08-01|owner of ukraines...| 0.83338201| null|  null|
|2022-08-01|animal shelter do...|0.013312108| null|  null|
|2022-08-01|    eb5f211900p4acc7|0.254985869| null|  null|
|2022-08-01|why  relations li...|0.409764558| null|  null|
|2022-08-01|crypto tattoos to...|0.005395876| null|  null|
|2022-08-01|its happening on ...|0.061495773| null|  null|
|2022-08-01|will certainly lo...|0.881493032| null|  null|
|2022-08-01|serbian vucic is ...|0.974542975| null|  null|
|2022-08-01|animal shelter do...|0.013312108| null|  nul

In [None]:
# Load the data
# The tweets were already read from S3 into the Spark DataFrame `joined_data_df`
# earlier in this notebook, so convert that to pandas here instead of reading an
# unrelated local CSV.
df = joined_data_df.toPandas()

# Keep only the requested feature columns that the file actually provides.
# Selecting a name that is absent would raise a KeyError, so report the
# missing ones instead of failing.
available_columns = [name for name in columns if name in df.columns]
missing_columns = [name for name in columns if name not in df.columns]
if missing_columns:
    print(f"Columns not found in joined_v001.csv (skipped): {missing_columns}")
df = df.loc[:, available_columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


In [None]:
# Clean the loaded frame `df` in place: filter rows, add a text-length feature,
# rewrite some values, then renumber the index and preview the result.

# Remove the `Issued` loan status
# Keeps only the rows whose 'sentiment' value is not the string 'Issued'.
# NOTE(review): the `columns` list used to subset `df` when loading does not
# include 'sentiment', so this lookup looks like it will raise KeyError; the
# 'Issued' value also reads like a leftover from a loan dataset -- confirm.
issued_mask = df['sentiment'] != 'Issued'
df = df.loc[issued_mask]

# Convert text to text length: store the character count of each row's text.
# NOTE(review): the new column name is spelled "text lengh" -- confirm that is intended.
df["text lengh"] = df["text"].str.len()

# TODO: convert Negative and Positive to Sentiment (no code implements this yet)

# convert datetime to numerical?
# The two lines below strip '%' from 'int_rate' and divide by 100; they do not
# touch the 'date' column.
# NOTE(review): 'int_rate' is not in the `columns` list either, so this also
# looks like it will raise KeyError -- presumably leftover code; confirm.
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values sentiment to negative and positive based on their values
# As written, this replaces any cell whose value equals 'Current' with the
# string 'sentiment'.
x = {'Current': 'sentiment'}   
df = df.replace(x)

# Maps each of 'negative', 'neutral' and 'positive' to 'sentiment' and replaces
# matching cell values anywhere in the frame.
# NOTE(review): this rewrites cell values, not column names -- confirm this is
# the intended way to build the sentiment label.
x = dict.fromkeys(['negative', 'neutral', 'positive'], 'sentiment')    
df = df.replace(x)

# Renumber the index from 0 after the row filtering above, discarding the old index.
df.reset_index(inplace=True, drop=True)

# Preview the first rows of the cleaned frame.
df.head()

# Split the Data into Training and Testing

In [None]:
# Create our features: every column except the target, with non-numeric
# columns one-hot encoded by get_dummies
X = pd.get_dummies(df.drop(columns='positive'))

# Create our target
y = df['positive']

# Split into training and testing sets. The model cells below use
# X_train / X_test / y_train / y_test, so they must be defined here.
# random_state=1 keeps the split reproducible, matching the seed used for the models.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Display summary statistics of the feature matrix X as a quick sanity check
X.describe()

In [None]:
# Check the balance of our target values: number of rows for each distinct value of y
y.value_counts()

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data.
2. Calculate the balanced accuracy score from `sklearn.metrics`.
3. Print the confusion matrix from `sklearn.metrics`.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with the feature score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests.

# Balanced Random Forest Classifier

In [None]:
# Fit a BalancedRandomForestClassifier (100 estimators, seed 1) on the training data
from imblearn.ensemble import BalancedRandomForestClassifier

rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
rf_model = rf_model.fit(X_train, y_train)

In [None]:
# Balanced accuracy of the random forest's predictions on the test set
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of test labels versus the current predictions
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalanced classification report for the current predictions
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

In [None]:
# Rank the features by the random forest's importance score, highest first
priority_features = pd.Series(
    data=rf_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(priority_features)

# Easy Ensemble AdaBoost Classifier

In [None]:
# Fit an EasyEnsembleClassifier (100 estimators, seed 1) on the training data
from imblearn.ensemble import EasyEnsembleClassifier

eec_model = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
)
eec_model = eec_model.fit(X_train, y_train)

In [None]:
# Balanced accuracy of the easy ensemble's predictions on the test set
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of test labels versus the current predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalanced classification report for the current predictions
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))