## Basic Neural Network in Keras
We will use a simple dataset for mobile price range classification. The dataset consists of 20 features, and we need to predict the price range in which a phone lies. These ranges are divided into 4 classes.
<br>
Dataset - https://www.kaggle.com/iabhishekofficial/mobile-price-classification 


In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:


# Read the raw CSV into a DataFrame and preview its first rows
file_path = r"amazon.csv"
dataset = pd.read_csv(filepath_or_buffer=file_path, encoding="iso-8859-1")
dataset.head(n=5)

In [None]:


# Read the CSV into the frame that the following cells work on
file_path = r"amazon.csv"
amazon_df = pd.read_csv(filepath_or_buffer=file_path, encoding="iso-8859-1")

# Leading rows of the frame (a notebook only renders a cell's final expression)
amazon_df.head(n=5)

# Number of null entries in each column
missing_values = amazon_df.isna().sum()

# Number of rows that repeat an earlier row exactly
duplicates = amazon_df.duplicated().sum()

(missing_values, duplicates)


In [None]:
# Function to convert price and discount percentage columns to numeric
def convert_to_numeric(column):
    """Convert a column of formatted number strings to a numeric Series.

    Every character other than the digits 0-9 and '.' is removed before
    parsing, so currency symbols, thousands separators and percent signs are
    dropped (a minus sign is dropped as well).

    Parameters
    ----------
    column : pandas.Series
        String values such as '$1,099' or '64%'. A Series that already has a
        numeric dtype is returned unchanged, so re-running the conversion on
        an already converted column does not fail on the `.str` accessor.

    Returns
    -------
    pandas.Series
        The parsed numeric values.

    Raises
    ------
    ValueError
        If a cleaned value still cannot be parsed as a number (e.g. '1.2.3').
    """
    # Already numeric: nothing to strip, and `.str` would raise AttributeError
    if pd.api.types.is_numeric_dtype(column):
        return column
    return pd.to_numeric(column.str.replace('[^0-9.]', '', regex=True))




In [None]:
# Strip formatting from the price and discount columns and parse them as numbers
for price_col in ('discounted_price', 'actual_price', 'discount_percentage'):
    amazon_df[price_col] = convert_to_numeric(amazon_df[price_col])



In [None]:
# Parse 'rating' as a number; entries that cannot be parsed become NaN
amazon_df['rating'] = pd.to_numeric(
    amazon_df['rating'],
    errors='coerce',
)

# Parse 'rating_count' after removing the commas; unparseable entries become NaN
amazon_df['rating_count'] = pd.to_numeric(
    amazon_df['rating_count'].str.replace(',', ''),
    errors='coerce',
)

# Column dtypes after the two conversions
updated_data_types = amazon_df.dtypes
updated_data_types


In [None]:
# Identify entries in the 'rating' column that are not plain decimal numbers.

# Work on the string form so that the .str accessor is available
amazon_df['rating'] = amazon_df['rating'].astype(str)

# Remove the decimal point literally, then test whether only digits remain.
# regex=False matters here: as a regular expression '.' matches every
# character, which empties each string and flags every row as non-numeric.
non_numeric_ratings = amazon_df.loc[
    ~amazon_df['rating'].str.replace('.', '', regex=False).str.isnumeric(),
    'rating',
]

non_numeric_ratings.unique()



In [None]:
# Remove the rows whose 'rating' is the literal '|'.
# .copy() makes the filtered frame independent of the previous one, so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
# NOTE(review): an earlier cell already coerces 'rating' with errors='coerce',
# so '|' may no longer be present at this point - confirm the intended order.
amazon_df = amazon_df[amazon_df['rating'] != '|'].copy()

# Convert 'rating' back to numeric; anything unparseable becomes NaN
amazon_df['rating'] = pd.to_numeric(amazon_df['rating'], errors='coerce')

# Recheck the data types after conversion
updated_data_types = amazon_df.dtypes

updated_data_types


In [None]:
# Keep only the rows that have a value in every column
amazon_df = amazon_df.dropna(axis=0, how='any')

# Per-column null counts after the drop
updated_missing_values = amazon_df.isna().sum()
updated_missing_values


In [None]:
# Preview the first rows of the frame after the cleaning steps
amazon_df.head()

In [None]:

# Split each '|'-separated category string into parts, one part per row
category_parts = amazon_df['category'].str.split('|')
categories = category_parts.explode()

# Distinct category names
unique_categories = categories.unique()


In [None]:
# Display the distinct category names
unique_categories

In [None]:
# Count the entries of `unique_categories` and print the total
number_of_categories = len(unique_categories)
print(number_of_categories)


In [None]:
# Remove the free-text and identifier columns in one call
amazon_df.drop(
    columns=['about_product', 'review_id', 'user_id', 'product_name', 'user_name'],
    inplace=True,
)

# Replace each product_id by the integer code of its categorical level
amazon_df['product_id'] = amazon_df['product_id'].astype('category').cat.codes
amazon_df

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' polarity score that `sia` assigns to `text`.

    The argument is passed through str() first, so non-string values are
    scored on their string form.
    """
    scores = sia.polarity_scores(str(text))
    return scores['compound']


# Score every entry of 'review_content' and store it in a new 'sentiment' column
amazon_df['sentiment'] = amazon_df['review_content'].apply(get_sentiment)
amazon_df

In [None]:
# The raw review text is removed after the sentiment score has been stored
amazon_df.drop(columns='review_content', inplace=True)
amazon_df

In [None]:
# Remove the review title column
amazon_df.drop(columns='review_title', inplace=True)
amazon_df

In [None]:
# Remove the link columns and the 'sales' column in one call
amazon_df.drop(columns=['img_link', 'product_link', 'sales'], inplace=True)
amazon_df

In [None]:
# Fill missing values in each numeric column with that column's mean
mean_fill_cols = [
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
]
amazon_df.fillna(
    {col: amazon_df[col].mean() for col in mean_fill_cols},
    inplace=True,
)


In [None]:
# Absolute saving: list price minus discounted price
amazon_df['price_difference'] = amazon_df['actual_price'].sub(amazon_df['discounted_price'])

# 1 when the discount percentage is above zero, otherwise 0
amazon_df['has_discount'] = amazon_df['discount_percentage'].map(
    lambda pct: 1 if pct > 0 else 0
)

# Bin the rating into three labelled intervals: (0, 2] -> 0, (2, 4] -> 1, (4, 5] -> 2
amazon_df['rating_level'] = pd.cut(
    amazon_df['rating'],
    bins=[0, 2, 4, 5],
    labels=[0, 1, 2],
)


In [None]:
# One indicator column per name found in the '|'-separated 'category' strings
one_hot_encoded_data = amazon_df['category'].str.get_dummies(sep='|')

# Attach the indicator columns and remove the original 'category' column
df = amazon_df.join(one_hot_encoded_data).drop(columns='category')
df

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns passed through the scaler
numerical_cols = [
    'discounted_price',
    'actual_price',
    'discount_percentage',
    'rating',
    'rating_count',
]

# Fit the scaler on these columns and write the transformed values back
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values
df


In [None]:

# Targets: the 'rating' and 'rating_count' columns
target_cols = ["rating", "rating_count"]

# Features: every other column, minus 'rating_level'. That column is binned
# directly from 'rating', so keeping it would leak the target into the inputs.
X = df.drop(columns=target_cols + ["rating_level"]).values

y = df[target_cols].values

In [None]:
 # Display the two columns used as targets
 df[["rating","rating_count"]]

In [None]:
# Display the target array
y

In [None]:
# Display the feature array
X

In [None]:
# Length of the first row of X, i.e. the number of values per sample
len(X[0])

In [None]:
# Length of the first row of y, i.e. the number of target values per sample
len(y[0])

In [None]:


# Split first, then normalize, so the scaler never sees the held-out rows
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train test split of model inputs and targets (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Fit the standardization on the training rows only and apply the same
# transform to the test rows; fitting on all rows would leak test statistics
# into training. X itself is left untouched, which keeps this cell re-runnable.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print('Normalized data:')
print(X_train[0])

In [None]:
# Print the training feature array
print(X_train)

In [None]:
# Element dtype of the training feature array
X_train.dtype

In [None]:
import os

import tensorflow as tf

# Reload the previously saved model when the file is present.
# The call goes through `tf.keras`: the bare name `keras` is not bound by
# `import tensorflow as tf`, so using it raised NameError.
# The existence check keeps a fresh run working, because the file is only
# written by the save cell further down the notebook.
saved_model_path = 'models/myhomeworkModel2.h5'
if os.path.exists(saved_model_path):
    model = tf.keras.models.load_model(saved_model_path)

In [None]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Number of values per sample in X; the model cell sizes its layers from this
len(X[0])

In [None]:

# Fully connected network whose hidden widths shrink from n_features to n_features // 4
n_features = len(X[0])

model = Sequential()
model.add(Dense(n_features, input_dim=n_features, activation='relu'))  # input_dim matches the data shape
# Integer division: the unit count must be an integer and `/` yields a float.
# max(1, ...) keeps a layer from ending up with zero units on very narrow inputs.
model.add(Dense(max(1, n_features // 2), activation='relu'))
model.add(Dense(max(1, n_features // 2), activation='relu'))
model.add(Dense(max(1, n_features // 4), activation='relu'))
# Single 2-unit output layer; the duplicated second softmax layer was removed,
# since it re-normalized values that were already a probability distribution.
# NOTE(review): y is built from the 'rating' and 'rating_count' columns, which
# look continuous - confirm that a softmax output is what is intended here.
model.add(Dense(2, activation='softmax'))

In [None]:
# To visualize neural network: print the model's summary
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in batches of 64 and keep the per-epoch history
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
)

In [None]:
# Model outputs for the held-out rows
y_pred = model.predict(X_test)

# For each output row, keep the index of its largest value
pred = [np.argmax(output_row) for output_row in y_pred]

In [None]:
# For each row of y_test, keep the index of its largest value
test = [np.argmax(target_row) for target_row in y_test]

In [None]:
# Share of positions where the `pred` and `test` index lists agree
from sklearn.metrics import accuracy_score
a = accuracy_score(pred, test)
# Scale the fraction to a percentage for printing
print('Accuracy is:', a*100)

In [None]:
# Fit again, this time reporting metrics on the test split after every epoch.
# This calls fit on the same `model` object, so it is not a fresh training run.
history1 = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=100,
)

In [None]:
# Validation (test split) accuracy per epoch.
# Only this curve is drawn: with two curves and the single legend entry
# 'Test', the label was attached to the training curve instead.
plt.plot(history1.history['val_accuracy'], label='Test')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Training accuracy per epoch.
# Only this curve is drawn: the legend names a single series ('Train'), and a
# second, unlabeled validation curve made the figure ambiguous.
plt.plot(history1.history['accuracy'], label='Train')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Training and validation accuracy per epoch on one set of axes
fig, ax = plt.subplots()
ax.plot(history1.history['accuracy'])
ax.plot(history1.history['val_accuracy'])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training and validation loss per epoch on one set of axes
fig, ax = plt.subplots()
ax.plot(history1.history['loss'])
ax.plot(history1.history['val_loss'])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
import os

# Create the target folder first; the save fails when 'models/' does not exist
os.makedirs('models', exist_ok=True)
model.save('models/myhomeworkModel2.h5')


In [None]:
# Sensitivity sketch: shift one feature column of the test set and compare predictions.
# NOTE(review): `rating_col_index`, `rating_increase` and `original_sales` are not
# assigned anywhere in the visible notebook; unless they are defined elsewhere this
# cell raises NameError on a fresh kernel.
# Assuming 'model' is your trained ANN model
# 'X_test' is your test dataset

# Create a copy for manipulation
X_test_modified = X_test.copy()

# Increase ratings in the test data
# Assuming 'rating' column index is known
# NOTE(review): the feature matrix X is built with the 'rating' column dropped, so
# X_test may not contain a rating column to shift - confirm which index is meant.
X_test_modified[:, rating_col_index] += rating_increase  # e.g., increase by 0.5 or 1

# Predict sales with increased ratings
# NOTE(review): the model's last layer has 2 units and the 'sales' column is dropped
# in an earlier cell, so whether these outputs represent sales should be confirmed.
predicted_sales_with_higher_ratings = model.predict(X_test_modified)

# Calculate percentage change in sales
# Element-wise relative change; entries where original_sales is 0 divide by zero
percentage_change = ((predicted_sales_with_higher_ratings - original_sales) / original_sales) * 100

# Print the average percentage change
# np.mean without an axis averages over every element of the array
average_percentage_change = np.mean(percentage_change)
print(f"Average estimated increase in sales: {average_percentage_change}%")
