In [1]:

# === Common Imports ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make plots a bit larger
plt.rcParams["figure.figsize"] = (6,4)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import (r2_score, mean_squared_error, mean_absolute_error,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_curve, auc, jaccard_score)
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA

# Optional: SciPy for dendrogram (only used if available)
try:
    from scipy.cluster.hierarchy import dendrogram, linkage
    SCIPY_OK = True
except Exception:
    SCIPY_OK = False

import os

RANDOM_STATE = 42

# === Small helpers ===
def ensure_csv_or_make(df, fname):
    # If fname exists, return pd.read_csv(fname); else save df to fname then read+return.
    if os.path.exists(fname):
        return pd.read_csv(fname)
    else:
        df.to_csv(fname, index=False)
        return df

def plot_actual_vs_pred(y_true, y_pred, title="Actual vs Predicted"):
    plt.figure()
    plt.scatter(y_true, y_pred, alpha=0.7)
    mn = min(y_true.min(), y_pred.min())
    mx = max(y_true.max(), y_pred.max())
    plt.plot([mn, mx], [mn, mx])
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(title)
    plt.show()

def print_regression_metrics(y_true, y_pred):
    print("R2:", r2_score(y_true, y_pred))
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("RMSE:", mean_squared_error(y_true, y_pred, squared=False))

def plot_confusion_matrix_basic(cm, class_names=None, title="Confusion Matrix"):
    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    import numpy as _np
    tick_marks = _np.arange(cm.shape[0])
    if class_names is None:
        class_names = [str(i) for i in range(cm.shape[0])]
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()


In [2]:

fname = "china_gdp.csv"
if not os.path.exists(fname):
    np.random.seed(RANDOM_STATE)
    years = np.arange(1960, 2015)
    t = (years - 1960)/55.0
    gdp = 1e2 + 1e4 / (1 + np.exp(-10*(t-0.6))) + np.random.normal(0, 200, size=len(years))
    df = pd.DataFrame({"Year": years, "GDP": gdp})
    df = ensure_csv_or_make(df, fname)

data = pd.read_csv(fname)
if "Year" in data.columns and "GDP" in data.columns:
    X = data[["Year"]].values
    y = data["GDP"].values
else:
    X = data.iloc[:, [0]].values
    y = data.iloc[:, -1].values

poly = PolynomialFeatures(degree=4, include_bias=False)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=RANDOM_STATE)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

print_regression_metrics(y_test, pred)

X_full_poly = poly.transform(X)
y_full_pred = lr.predict(X_full_poly)
plt.figure()
plt.scatter(X.flatten(), y, s=12, label="Actual")
plt.plot(X.flatten(), y_full_pred, label="Model", linewidth=2)
plt.xlabel("Year")
plt.ylabel("GDP")
plt.title("China GDP: Polynomial Regression Fit")
plt.legend()
plt.show()

plot_actual_vs_pred(y_test, pred, title="China GDP: Actual vs Predicted (Test)")


R2: 0.9880580319158478
MAE: 360.65220122090193


TypeError: got an unexpected keyword argument 'squared'