### Organize Data into CSV and Matrix

In [18]:
# imports
import pandas as pd
import numpy as np
import random

In [59]:
### Census Sales Data
# Reads every sheet of the Census retail-sales workbook, stitches the value
# columns of all sheets side by side next to the label columns, and writes the
# combined table to 'CensusNonAdjusted.csv'.

xlfname = 'mrtssales92-present (1).xlsx'
xl = pd.ExcelFile(xlfname)

# Row labels 0-7 (as numbered after header=4) are dropped from the label
# frame and from every data frame, so all frames share the same row index and
# line up when concatenated column-wise.
skip_rows = list(range(8))

### Labels
# Columns A:B of the first sheet, renamed to "NAICS Code" / "Business Type".
df_label = xl.parse(xl.sheet_names[0], header=4, usecols='A:B', nrows=66)
df_label = df_label.drop(index=skip_rows)
df_label = df_label.set_axis(["NAICS Code", "Business Type"], axis=1)

### Not adjusted
# Collect one frame per sheet and concatenate once at the end instead of
# re-concatenating (and re-copying) the growing result on every iteration.
frames = [df_label]
for sheet in xl.sheet_names:
    # The '2023' sheet is read with a narrower column range (C:K) than the
    # other sheets (C:N) -- presumably that year is incomplete; TODO confirm.
    value_cols = 'C:K' if sheet == '2023' else 'C:N'
    df_tmp = xl.parse(sheet, header=4, usecols=value_cols, nrows=66)
    df_tmp = df_tmp.drop(index=skip_rows)
    frames.append(df_tmp)

# Labels first, then every sheet's columns in workbook sheet order.
df_nonadjust = pd.concat(frames, axis=1, ignore_index=False, sort=False)

csvfile = 'CensusNonAdjusted.csv'
df_nonadjust.to_csv(csvfile, index=False)


# Need to organize this further and then get matrix with specific columns 

In [9]:
### Zillow Housing Price Data
# Loads the state-level time series, keeps three columns, drops incomplete
# rows and writes the result to 'Zillow.csv'.

df = pd.read_csv('State_time_series.csv', parse_dates=['Date'])

# Select the needed columns directly instead of rebuilding the frame row by
# row through zip(); the copy keeps `df` untouched when columns are renamed.
data = df[['Date', 'Sale_Prices', 'ZHVI_AllHomes']].copy()
data.columns = ['Date', 'Sale_Prices', 'ZHVI']

# thresh=3 on a three-column frame keeps only rows with no missing value.
# reset_index returns a new frame, so its result must be assigned; otherwise
# the index keeps the gaps left by the dropped rows.
data1_drop = data.dropna(thresh=3).reset_index(drop=True)

csvfile = 'Zillow.csv'
data1_drop.to_csv(csvfile, index=False)

Zillow_Data = data1_drop # specific columns to use 

In [10]:
### Hotel Booking Data
# Loads the raw hotel bookings, keeps six columns under friendlier names,
# drops incomplete rows and writes the result to 'Hotel.csv'.

df = pd.read_csv('hotel_bookings_raw.csv', parse_dates=['reservation_status_date'])

# Select the needed columns directly instead of rebuilding the frame row by
# row through zip(); the copy keeps `df` untouched when columns are renamed.
data = df[['reservation_status_date', 'stays_in_weekend_nights',
           'stays_in_week_nights', 'INFLATION', 'GDP', 'FUEL_PRCS']].copy()
data.columns = ['Date', 'Weekend Nights', 'Week Nights', 'Inflation', 'GDP', "Fuel Prices"]

# reset_index returns a new frame, so its result must be assigned; otherwise
# the index keeps the gaps left by the dropped rows.
data1_drop = data.dropna().reset_index(drop=True)

csvfile = 'Hotel.csv'
data1_drop.to_csv(csvfile, index=False)

Hotel_Data = data1_drop # specific columns to use 

In [12]:
### Coffee Sales Data
# Loads the coffee consumption table, drops rows with any missing value and
# writes the result to 'Coffee.csv'.

df = pd.read_csv('Coffee_domestic_consumption.csv')

# reset_index returns a new frame, so its result must be assigned; otherwise
# the index keeps the gaps left by the dropped rows.
data1_drop = df.dropna().reset_index(drop=True)

csvfile = 'Coffee.csv'
data1_drop.to_csv(csvfile, index=False)

Coffee_Data = data1_drop # specific columns to use 

In [15]:
### Avocado Sales Data
# Loads the avocado sheet, keeps five columns under shorter names, drops
# incomplete rows and writes the result to 'Avocado.csv'.

df = pd.read_csv('avocado_sheet.csv')

# Select the needed columns directly instead of rebuilding the frame row by
# row through zip(); the copy keeps `df` untouched when columns are renamed.
data = df[['AveragePrice', 'Total_Volume', 'Total_Bags', 'year', 'region']].copy()
data.columns = ['Price', 'Volume', 'Bags', 'Year', "Region"]

# reset_index returns a new frame, so its result must be assigned; otherwise
# the index keeps the gaps left by the dropped rows.
data1_drop = data.dropna().reset_index(drop=True)

csvfile = 'Avocado.csv'
data1_drop.to_csv(csvfile, index=False)

Avocado_Data = data1_drop # specific columns to use 

### Normalize Data

In [None]:
# explicit function to normalize array
def normalize(arr, t_min, t_max):
    """Linearly rescale the values of *arr* into the range [t_min, t_max].

    The smallest value of *arr* maps to ``t_min`` and the largest to
    ``t_max``.

    Args:
        arr: A sized, re-iterable collection of numbers (list, 1-D array, ...).
        t_min: Lower bound of the target range.
        t_max: Upper bound of the target range.

    Returns:
        A list with one rescaled value per element of *arr*. An empty input
        yields an empty list. If every element of *arr* is equal (zero
        spread), every output is ``t_min`` rather than dividing by zero.
    """
    if len(arr) == 0:
        return []

    # Compute the extremes once; calling min(arr) inside the loop made the
    # original quadratic in len(arr).
    lowest = min(arr)
    spread = max(arr) - lowest
    if spread == 0:
        # Constant input: there is no range to stretch, so pin to t_min.
        return [t_min for _ in arr]

    target_span = t_max - t_min
    return [((value - lowest) * target_span) / spread + t_min for value in arr]

# What to normalize?
# Will need to organize data better, all are different 

### Initial Regression/Correlation Models

In [None]:
### Functions for different methods
def LinearReg(x,y):
    """Fit a linear regression by solving the normal equation.

    Solves (X.T X) theta = X.T y, where X is *x* with a leading column of
    ones for the intercept term.

    Args:
        x: 2-D array of features, one row per sample.
        y: Target values, one per row of *x*.

    Returns:
        The solution ``theta``; its first entry is the intercept.
    """
    intercept_col = np.ones((x.shape[0], 1))
    design = np.hstack((intercept_col, x))  # ones on the left, features on the right
    lhs = np.matmul(design.T, design)
    rhs = np.matmul(design.T, y)
    return np.linalg.solve(lhs, rhs)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def grad_linear(theta,x,y):
    """Gradient for linear regression.

    Args:
        theta: Current parameter vector.
        x: Design matrix, one row per sample.
        y: Target values, one per row of *x*.

    Returns:
        ``(1/len(x)) * x.T (x theta - y)``.
    """
    residual = x.dot(theta) - y
    inv_rows = 1/len(x)
    return inv_rows*np.matmul(x.T, residual)

def grad_linear_stoch (theta,x,y):
    """Gradient for stochastic linear regression.

    Args:
        theta: Current parameter vector.
        x: Feature data; StochasticGD passes a single row ``X[k]`` here.
        y: Target matching *x*; StochasticGD passes ``y[k]``.

    Returns:
        ``(1/len(x)) * x * (x.dot(theta) - y)``.
    """
    # NOTE(review): when x is a single 1-D row, len(x) is the number of
    # features rather than the number of samples -- confirm this scaling is
    # intended.
    error = x.dot(theta) - y
    scale = 1/len(x)
    return scale*x*error

def grad_logistic(theta,x,y):
    """Gradient for logistic regression.

    Args:
        theta: Current parameter vector.
        x: Design matrix, one row per sample.
        y: Target values, one per row of *x*.

    Returns:
        ``(1/len(x)) * x.T (sigmoid(x theta) - y)``.
    """
    predicted = sigmoid(x.dot(theta))
    inv_rows = 1/len(x)
    return inv_rows*np.matmul(x.T, predicted - y)

def GradientDescent(x,y,theta,alpha,iteration,grad):
    """Run batch gradient descent.

    Args:
        x: 2-D array of features, one row per sample; a column of ones is
            prepended internally.
        y: Target values, passed through to *grad*.
        theta: Initial parameter vector (including the intercept entry).
        alpha: Step size.
        iteration: Number of update steps to perform.
        grad: Callable ``grad(theta, X, y)`` returning the gradient.

    Returns:
        A tuple ``(theta, h, theta_list)``: the final parameters, the sigmoid
        of ``X theta`` at those parameters, and the list of every theta
        visited (the initial one first).
    """
    ones_col = np.ones((x.shape[0], 1))
    design = np.hstack((ones_col, x))  # ones on the left, features on the right
    history = [theta]
    for _ in range(iteration):
        update = alpha*grad(theta, design, y)
        theta = theta - update
        history.append(theta)
    # The sigmoid is applied regardless of which gradient function was used.
    fitted = sigmoid(np.dot(design, theta))
    return theta, fitted, history

def StochasticGD(x,y,theta,alpha,iteration,grad):
    """Run stochastic gradient descent, one random sample per step.

    Args:
        x: 2-D array of features, one row per sample; a column of ones is
            prepended internally.
        y: Target values, indexable by row position.
        theta: Initial parameter vector (including the intercept entry).
        alpha: Step size.
        iteration: Number of update steps to perform.
        grad: Callable ``grad(theta, X[k], y[k])`` returning the gradient for
            a single sample.

    Returns:
        A tuple ``(theta, theta_list)``: the final parameters and the list of
        every theta visited (the initial one first).
    """
    X = np.hstack([np.ones((x.shape[0], 1)), x])  # ones on the left, features on the right
    n_samples = x.shape[0]
    theta_list = [theta]
    for _ in range(iteration):
        # randrange(n) draws uniformly from 0..n-1. The previous
        # randint(1, n-1) is inclusive on both ends, so it never selected
        # sample 0 and raised ValueError for a single-row dataset.
        k = random.randrange(n_samples)
        theta = theta - alpha*grad(theta, X[k], y[k])
        theta_list.append(theta)
    return theta, theta_list

def newton_method(x, y, num_iterations):
    """Fit logistic-regression parameters with Newton's method.

    Args:
        x: 2-D array of features, one row per sample; a column of ones is
            prepended internally.
        y: 1-D array of targets, one per row of *x*.
        num_iterations: Number of Newton updates to perform.

    Returns:
        A tuple ``(theta, h)``: the final parameters (starting from zeros)
        and the sigmoid of ``X theta`` at those parameters.

    Raises:
        numpy.linalg.LinAlgError: If the Hessian is singular.
    """
    X = np.hstack([np.ones((x.shape[0], 1)), x])  # ones on the left, features on the right
    m, n = X.shape
    theta = np.zeros(n)  # Initialize the parameters
    for _ in range(num_iterations):
        h = sigmoid(np.dot(X, theta))
        gradient = np.dot(X.T, (h - y)) / m
        # Scaling the columns of X.T by h*(1-h) equals X.T @ diag(h*(1-h))
        # without materializing the dense m-by-m diagonal matrix.
        weights = h * (1 - h)
        hessian = (1 / m) * np.dot(X.T * weights, X)
        # Solve H * step = gradient directly instead of forming inv(H).
        theta = theta - np.linalg.solve(hessian, gradient)
    return theta, sigmoid(np.dot(X, theta))