# Neural Networks

In [1]:
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:
# Load the data
df = pd.read_csv('BankRecords.csv')
df.head()

Unnamed: 0,ID,Age,Experience(Years),Income(Thousands's),Sort Code,Family,Credit Score,Education,Mortgage(Thousands's),Personal Loan,Securities Account,CD Account,Online Banking,CreditCard
0,1,25,1,49,91107,4,1.6,Diploma,0,No,Yes,No,No,No
1,2,45,19,34,90089,3,1.5,Diploma,0,No,Yes,No,No,No
2,3,39,15,11,94720,1,1.0,Diploma,0,No,No,No,No,No
3,4,35,9,100,94112,1,2.7,Degree,0,No,No,No,No,No
4,5,35,8,45,91330,4,1.0,Degree,0,No,No,No,No,Yes


## Data Preparation

In [4]:
# Renaming some of the columns to remove especial characters
df = df.rename(columns={
    'Experience(Years)': 'Experience_years',
    'Income(Thousands\'s)': 'Income_thousands',
    'Mortgage(Thousands\'s)' : 'Mortgage_thousands'
})

### Checking the dataframe's characteristics

In [5]:
df.shape

(5000, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience_years    5000 non-null   int64  
 3   Income_thousands    5000 non-null   int64  
 4   Sort Code           5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   Credit Score        5000 non-null   float64
 7   Education           5000 non-null   object 
 8   Mortgage_thousands  5000 non-null   int64  
 9   Personal Loan       5000 non-null   object 
 10  Securities Account  5000 non-null   object 
 11  CD Account          5000 non-null   object 
 12  Online Banking      5000 non-null   object 
 13  CreditCard          5000 non-null   object 
dtypes: float64(1), int64(7), object(6)
memory usage: 547.0+ KB


In [7]:
df.describe()

Unnamed: 0,ID,Age,Experience_years,Income_thousands,Sort Code,Family,Credit Score,Mortgage_thousands
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937913,56.4988
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747666,101.713802
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,101.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,635.0


When checking the values of the numerical columns,  negative values can be observed in column 'Experience_years'. Negative values for experience are not realistic, for this reason they are going to be removed.

Double checking to confirm the existence of negative values.

In [69]:
# Select the numerical columns from the DataFrame
df_numerical = df[['Age', 'Experience_years', 'Income_thousands', 'Family', 'Credit Score', 'Mortgage_thousands']]

# Check for negative numbers in the selected numerical columns
negative_values = (df_numerical < 0)

# Check if there are any negative values in each column
print("\nColumns with negative values:\n", negative_values.any())

# If you want to get the exact locations of negative values
print("\nLocations of negative values:\n", df_numerical[df_numerical < 0].stack())



Columns with negative values:
 Age                   False
Experience_years       True
Income_thousands      False
Family                False
Credit Score          False
Mortgage_thousands    False
dtype: bool

Locations of negative values:
 89    Experience_years   -1.0
226   Experience_years   -1.0
315   Experience_years   -2.0
451   Experience_years   -2.0
524   Experience_years   -1.0
536   Experience_years   -1.0
540   Experience_years   -1.0
576   Experience_years   -1.0
583   Experience_years   -1.0
597   Experience_years   -2.0
649   Experience_years   -1.0
670   Experience_years   -1.0
686   Experience_years   -1.0
793   Experience_years   -2.0
889   Experience_years   -2.0
909   Experience_years   -1.0
1173  Experience_years   -1.0
1428  Experience_years   -1.0
1522  Experience_years   -1.0
1905  Experience_years   -1.0
2102  Experience_years   -1.0
2430  Experience_years   -1.0
2466  Experience_years   -2.0
2545  Experience_years   -1.0
2618  Experience_years   -3.0
2717  

### Dropping unnecessary columns
Features 'ID' and 'Sort Code' don't provide any meaningful relationship with the target variable and will be removed to reduce noise introduced into the model. All other features will be used in the model.

In [8]:
df.drop(columns=['ID', 'Sort Code'], inplace=True)

### Encoding categorical variables

This step transforms categories into a format that can be provided to the model.

In [9]:
# Encode categorical values using dummies
categorical_columns = ['Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online Banking', 'CreditCard']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [10]:
from sklearn.model_selection import train_test_split

# Define the feature matrix X and the target vector y
X = df_encoded.drop('Income_thousands', axis=1)
y = df_encoded['Income_thousands']

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


### Scaling features
Standardazing them to have a mean of 0 and a standard deviation of 1.
It Helps in speeding up training and improves model performance.


In [11]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data
X_test_scaled = scaler.transform(X_test)


In [14]:
X_train_scaled.shape

(4000, 12)

## Neural Network Model

The input layer has as many neurons as there are features in the dataset, ensuring each feature is represented. The first hidden layer has 64 neurons with a 'ReLU' activation function. The second hidden layer has 32 neurons with a 'ReLU' activation function, so the model can learn more complex patterns in the data.

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the neural network model
model_nn = Sequential()
model_nn.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model_nn.add(Dense(32, activation='relu'))
model_nn.add(Dense(1))

# Compile the model
model_nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train the model
model_nn.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=100, batch_size=32)

# Evaluate the neural network model
mae_nn = model_nn.evaluate(X_test_scaled, y_test)[1]


Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 6903.8813 - mean_absolute_error: 69.7544 - val_loss: 3734.2634 - val_mean_absolute_error: 47.9405
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2483.0386 - mean_absolute_error: 37.9947 - val_loss: 916.8630 - val_mean_absolute_error: 24.1640
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 930.8235 - mean_absolute_error: 24.3011 - val_loss: 874.4266 - val_mean_absolute_error: 23.6542
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 877.2695 - mean_absolute_error: 23.6607 - val_loss: 857.5135 - val_mean_absolute_error: 23.3572
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 862.7688 - mean_absolute_error: 23.2936 - val_loss: 846.2256 - val_mean_absolute_error: 23.3299
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━

## Random Forest (Regression Algorithm)

Random Forest is good for predicting customer income because it is robust and can handle complex, non-linear relationships. It provides insights into which features are important, works well with many features, and is flexible with different types of data. It also gives stable and reliable predictions.

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Initialize and train the Random Forest Regressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred_rf = model_rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)


### Results

Random Forest model performs better than the Neural Network in predicting customer income. 
The Mean Absolute Error (MAE) for the Random Forest model is 13.94, which is lower than the MAE of 21.08 for the Neural Network. 
This indicates that the Random Forest model makes more accurate predictions, with a fewer errors if compared to the Neural Network.

In [17]:
#Display results
print(f'Mean Absolute Error (Neural Network): {mae_nn}')
print(f'Mean Absolute Error (Random Forest): {mae_rf}')


Mean Absolute Error (Neural Network): 21.084430694580078
Mean Absolute Error (Random Forest): 13.938956281746032


# Model Optimizations

## Neural Networks

In [19]:
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam, SGD, RMSprop

In [22]:
# Define a function to create the neural network model
def create_model(neurons=32, activation='relu', optimizer='adam',dropout_rate=0.0):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train_scaled.shape[1], activation=activation))
    model.add(Dropout(dropout_rate))  # Add dropout to prevent overfitting
    model.add(Dense(neurons, activation=activation))
    model.add(Dropout(dropout_rate))  # Add dropout to prevent overfitting
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Wrap the model using KerasRegressor
model_nn2 = KerasRegressor(model=create_model, verbose=0)

# Define the parameter grid
param_grid_nn = {
    'batch_size': [16, 32],
    'epochs': [50, 100],
    'model__optimizer': ['adam'],
    'model__activation': ['relu'],
    'model__neurons': [32, 64]
}


In [23]:
# Perform Randomized Search for hyperparameter tuning
random_search_nn = RandomizedSearchCV(estimator=model_nn2, param_distributions=param_grid_nn, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=3, random_state=42)
random_result_nn = random_search_nn.fit(X_train_scaled, y_train)

# Display the best parameters from Randomized Search
print(f"Best NN: {random_result_nn.best_score_} using {random_result_nn.best_params_}")


Best NN: -21.419204104105177 using {'model__optimizer': 'adam', 'model__neurons': 64, 'model__activation': 'relu', 'epochs': 100, 'batch_size': 16}


In [24]:
# Train the best model
best_nn = random_result_nn.best_estimator_
best_nn.fit(X_train_scaled, y_train, epochs=random_result_nn.best_params_['epochs'], batch_size=random_result_nn.best_params_['batch_size'], validation_data=(X_test_scaled, y_test))

# Evaluate the neural network model
mae_nn = best_nn.score(X_test_scaled, y_test)
print(f'Mean Absolute Error (Neural Network): {mae_nn}')

Mean Absolute Error (Neural Network): 0.649674117565155


In [56]:
# Perform Grid Search for tuning
grid_search_nn = GridSearchCV(estimator=model_nn2, param_grid=param_grid_nn, scoring='neg_mean_absolute_error', n_jobs=-1, cv=3)
grid_result_nn = grid_search_nn.fit(X_train_scaled, y_train)

# Display the best parameters from grid search
print(f"Best NN: {grid_result_nn.best_score_} using {grid_result_nn.best_params_}")


Best NN: -21.21241191878715 using {'batch_size': 32, 'epochs': 100, 'model__activation': 'relu', 'model__neurons': 64, 'model__optimizer': 'adam'}


## Random Forest

In [None]:
# Define the parameter grid for random forest model
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Perform Randomized Search for random forest model
random_search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=param_grid_rf, n_iter=20, scoring='neg_mean_absolute_error', n_jobs=-1, cv=3, random_state=42)
random_result_rf = random_search_rf.fit(X_train, y_train)

# Display the best parameters from randomized search for random forest model
print(f"Best RF: {random_result_rf.best_score_} using {random_result_rf.best_params_}")

## New customers predictions

In [54]:
import random

# Function to generate random customer data
def generate_customer():
    return {
        'Age': random.randint(18, 80),
        'Experience_years': random.randint(1, 50),
        'Family': random.randint(1, 5),
        'Credit Score': round(random.uniform(0.5, 4.0), 2),
        'Education': random.choice(['Diploma', 'Degree', 'Masters']),
        'Mortgage_thousands': 0 if random.random() < 0.4 else random.randint(1, 500),
        'Personal Loan': random.choice(['Yes', 'No']),
        'Securities Account': random.choice(['Yes', 'No']),
        'CD Account': random.choice(['Yes', 'No']),
        'Online Banking': random.choice(['Yes', 'No']),
        'CreditCard': random.choice(['Yes', 'No'])
    }

In [55]:
# Generate 10 random new customers
new_customers = [generate_customer() for _ in range(10)]

# Convert new customers to DataFrame
new_customers_df = pd.DataFrame(new_customers)

# Encode categorical variables
categorical_columns = ['Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online Banking', 'CreditCard']
new_customers_encoded = pd.get_dummies(new_customers_df, columns=categorical_columns, drop_first=True)

# Ensure all columns are present
new_customers_encoded = new_customers_encoded.reindex(columns=X.columns, fill_value=0)

# Scale features
new_customers_scaled = scaler.transform(new_customers_encoded)

# Make predictions using the improved neural network model
predicted_incomes = best_nn.predict(new_customers_scaled)

# Add predicted incomes to the new customers DataFrame
new_customers_df['Predicted Income'] = predicted_incomes

# Display the new customers with their predicted incomes
new_customers_df


Unnamed: 0,Age,Experience_years,Family,Credit Score,Education,Mortgage_thousands,Personal Loan,Securities Account,CD Account,Online Banking,CreditCard,Predicted Income
0,80,4,1,3.45,Degree,0,No,No,Yes,No,Yes,113.887711
1,41,28,3,1.01,Degree,0,Yes,No,Yes,No,No,150.244446
2,75,39,2,1.77,Degree,216,No,No,No,No,No,65.829163
3,53,49,3,1.06,Diploma,0,No,Yes,No,No,No,54.433525
4,35,49,1,1.71,Diploma,277,No,Yes,No,No,No,71.219551
5,54,39,2,1.65,Masters,153,Yes,Yes,Yes,Yes,No,143.066864
6,33,39,5,2.47,Masters,0,Yes,Yes,No,Yes,Yes,125.179474
7,68,26,1,0.51,Diploma,379,Yes,No,No,No,No,147.595779
8,36,12,2,3.59,Diploma,98,Yes,No,No,Yes,Yes,81.169662
9,38,50,1,1.58,Diploma,414,Yes,No,Yes,No,No,121.551666


# Sentiment Analysis - Game of thrones Tweets

In [None]:
import re
from transformers import pipeline
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load full dataset with selected columns
df_full = pd.read_csv('gotTwitter.csv', usecols=['text', 'created_at', 'screen_name'])


In [None]:
# Select a random subset of 2000 rows
df_got = df_full.sample(n=1000, random_state=35).reset_index(drop=True)

# Display the first few rows of the random subset
df_got.head()


In [None]:
# Function to clean the tweet text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

# Apply the cleaning function to the tweet text
df_got['cleaned_text'] = df_got['text'].apply(clean_text)

In [None]:
# Load the sentiment analysis pipeline with a specified model
sentiment_pipeline = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment')


In [None]:
# Function to map model output to positive, neutral, and negative sentiments
def map_sentiment(label):
    score = int(label.split()[0])  # Extract the score from the label (e.g., '3 stars')
    if score == 1 or score == 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

# Apply the sentiment analysis pipeline to the cleaned text with progress bar
tqdm.pandas()
df_got['sentiment'] = df_got['cleaned_text'].progress_apply(lambda x: map_sentiment(sentiment_pipeline(x)[0]['label']))


In [None]:
# Display the first few sentiment results
df_got[['cleaned_text', 'sentiment']].head()

## Barchart of amount ot tweets per sentiment

In [None]:
# Count the number of each sentiment
sentiment_counts = df_got['sentiment'].value_counts()

# Plot the sentiment distribution
plt.figure(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=['#ff9999','#66b3ff','#99ff99'])
plt.title('Sentiment Distribution of Game of Thrones Tweets')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()


## Piechart of sentiment distribution

In [None]:
# Count the number of each sentiment
sentiment_counts = df_got['sentiment'].value_counts()

# Plot the pie chart
plt.figure(figsize=(8, 8))
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=['#ff9999','#66b3ff','#99ff99'])
plt.title('Sentiment Distribution of Game of Thrones Tweets')
# plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle.
plt.show()


## Wordcloud

In [None]:
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Function to generate word cloud
def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')
    plt.show()

# Generate word clouds for different sentiments
positive_text = ' '.join(df_got[df_got['sentiment'] == 'positive']['cleaned_text'])
neutral_text = ' '.join(df_got[df_got['sentiment'] == 'neutral']['cleaned_text'])
negative_text = ' '.join(df_got[df_got['sentiment'] == 'negative']['cleaned_text'])

generate_wordcloud(positive_text, 'Positive Sentiment Words')
generate_wordcloud(neutral_text, 'Neutral Sentiment Words')
generate_wordcloud(negative_text, 'Negative Sentiment Words')


## Sentiment over time

In [None]:
# Convert 'created_at' to datetime
df_got['created_at'] = pd.to_datetime(df_got['created_at'])

# Resample to get weekly sentiment counts
weekly_sentiment = df_got.set_index('created_at').resample('W')['sentiment'].value_counts().unstack().fillna(0)

# Plot the sentiment over time
weekly_sentiment.plot(kind='line', marker='o', figsize=(8, 4),color=['#ff9999','#66b3ff','#99ff99'])
plt.title('Sentiment Over Time')
plt.xlabel('Week')
plt.ylabel('Count')
plt.legend(title='Sentiment',bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
