In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Linear Regression example for opened_inside_year:

data = pd.read_csv('processed_dataset.csv')

# Drop non-feature columns from the DataFrame
X = data.drop(columns=['opened_inside_year','carried_over_from','finished','carried_over_to'])

# Separate the target variable
y = data['opened_inside_year']

# Split data into training and testing sets
# First 81 row is the year 2021
X_train, X_test, y_train, y_test = X.iloc[81:], X.iloc[:81], y.iloc[81:], y.iloc[:81]

# Preprocessing
# One-hot encode the city column
# Scale the population column
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['city']),
        ('pop_scaler', RobustScaler(), ['population']),
        ('density_scaler', RobustScaler(), ['population_density'])
    ]
)

# Define the model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Print predicted and real y-values for the first 10 rows
for i in range(10):
    print("Predicted:", y_pred[i], "\tReal:", y_test.iloc[i])

# Calculate absolute percentage error for each prediction
absolute_percentage_errors = np.abs((y_test - y_pred) / y_test)

# Calculate mean absolute percentage error
mape = np.mean(absolute_percentage_errors)

# Convert MAPE to accuracy (accuracy = 1 - MAPE)
accuracy = 1 - mape

# Convert accuracy to percentage
percentage_accuracy = accuracy * 100

print("Average Percentage Accuracy:", percentage_accuracy)

Predicted: 742599.0428771579 	Real: 841644
Predicted: 56942.487488780076 	Real: 67719
Predicted: 24002.33920980757 	Real: 25475
Predicted: 17081.806148229225 	Real: 18984
Predicted: 64681.8112739348 	Real: 73542
Predicted: 29381.437749240282 	Real: 33447
Predicted: 241254.1137392953 	Real: 305541
Predicted: 72443.3131129292 	Real: 87538
Predicted: 56220.25231355736 	Real: 59477
Predicted: 71668.69689046004 	Real: 83130
Average Percentage Accuracy: 84.88081242370836


# Polynomial Regression from Scratch

In [10]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

class PolynomialRegression:
    def __init__(self, degree=2, method='normal_equation', lr=0.001, iterations=10000):
        self.degree = degree
        self.method = method
        self.lr = lr
        self.iterations = iterations
        self.weights = None

    def create_polynomial_features(self, X):
        poly_features = [X]
        for d in range(2, self.degree + 1):
            poly_features.append(X.pow(d))
        return torch.cat(poly_features, dim=1)

    def normal_equation(self, X, y):
        XTX_inv = torch.inverse(X.T.mm(X))
        XTy = X.T.mm(y)
        weights = XTX_inv.mm(XTy)
        return weights

    def gradient_descent(self, X, y):
        weights = torch.ones((X.shape[1], 1), dtype=torch.float32)
        n = X.shape[0]
        for _ in range(self.iterations):
            y_pred = X.mm(weights)
            grad = (2/n) * X.T.mm(y_pred - y)
            weights -= self.lr * grad
        return weights

    def fit(self, X, y):
        X_poly = self.create_polynomial_features(X)
        if self.method == 'normal_equation':
            self.weights = self.normal_equation(X_poly, y)
        elif self.method == 'gradient_descent':
            self.weights = self.gradient_descent(X_poly, y)
        else:
            raise ValueError("Method must be 'normal_equation' or 'gradient_descent'")

    def predict(self, X):
        X_poly = self.create_polynomial_features(X)
        return X_poly.mm(self.weights)

# Verileri yükleme ve hazırlama
data = pd.read_csv('processed_dataset.csv')

# Kategorik değişkeni sayısal değere dönüştürme
le = LabelEncoder()
data['city'] = le.fit_transform(data['city'])

# Eğitim ve test setlerine ayırma (2019 sonrası test)
train = data[data['time_since_event'] > 5]
test = data[data['time_since_event'] <= 5]

X_train = train.drop(columns=['opened_inside_year', 'carried_over_from', 'finished', 'carried_over_to'])
y_train = train['opened_inside_year']

X_test = test.drop(columns=['opened_inside_year', 'carried_over_from', 'finished', 'carried_over_to'])
y_test = test['opened_inside_year']


# Verileri torch tensörlerine dönüştürme
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float64)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float64).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float64)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float64).unsqueeze(1)



# Normal Equation ile model oluşturma ve eğitme
model_ne = PolynomialRegression(degree=4, method='normal_equation')
model_ne.fit(X_train_tensor, y_train_tensor)

# # Gradient Descent ile model oluşturma ve eğitme (valuelar nan oluyor düzeltemedim)
# model_gd = PolynomialRegression(degree=4, method='gradient_descent', lr=0.001, iterations=10000)
# model_gd.fit(X_train_tensor, y_train_tensor)

# Tahminler
y_pred_ne_test = model_ne.predict(X_test_tensor).detach().numpy()
# y_pred_gd_test = model_gd.predict(X_test_tensor).detach().numpy()
y_test_numpy = y_test_tensor.numpy()

# İlk 10 tahmin ve gerçek değerleri yazdırma
print("Normal Equation Predictions vs Real Values")
for i in range(10):
    print("Predicted:", y_pred_ne_test[i][0], "\tReal:", y_test_numpy[i][0])

# print("\nGradient Descent Predictions vs Real Values")
# for i in range(10):
#     print("Predicted:", y_pred_gd_test[i][0], "\tReal:", y_test_numpy[i][0])

# Hata hesaplamaları
def calculate_accuracy(y_true, y_pred):
    absolute_percentage_errors = np.abs((y_true - y_pred) / y_true)
    mape = np.mean(absolute_percentage_errors)
    accuracy = 1 - mape
    percentage_accuracy = accuracy * 100
    return percentage_accuracy

accuracy_ne = calculate_accuracy(y_test_numpy, y_pred_ne_test)
# accuracy_gd = calculate_accuracy(y_test_numpy, y_pred_gd_test)

print("\nAverage Percentage Accuracy (Normal Equation):", accuracy_ne)
# print("Average Percentage Accuracy (Gradient Descent):", accuracy_gd)


Normal Equation Predictions vs Real Values
Predicted: 864912.8298552418 	Real: 841644.0
Predicted: 68235.5469725392 	Real: 67719.0
Predicted: 28364.05290565998 	Real: 25475.0
Predicted: 20727.002287704276 	Real: 18984.0
Predicted: 73635.74326354462 	Real: 73542.0
Predicted: 33332.018968291624 	Real: 33447.0
Predicted: 271277.01123647403 	Real: 305541.0
Predicted: 88695.27612071109 	Real: 87538.0
Predicted: 58800.478365561954 	Real: 59477.0
Predicted: 75730.16436017047 	Real: 83130.0

Average Percentage Accuracy (Normal Equation): 86.95865546522951
Average Percentage Accuracy (Gradient Descent): nan
