In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the dataset; '?' entries are read as NaN and incomplete rows are dropped
auto = pd.read_csv('/content/Auto.csv', na_values='?').dropna()
# Predictors are every column except the first and the last one
# (assumes column 0 is mpg and the last column is non-numeric — TODO confirm)
X = auto.iloc[:, 1:-1]
y = auto['mpg']

# Split data into training and testing sets (regression target: mpg)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
lm_mse = mean_squared_error(y_test, y_pred)
lm_coef = pd.DataFrame(lm.coef_, index=X.columns, columns=['Coefficient'])
print('Linear Regression Results:')
print('MSE:', lm_mse)
print(lm_coef)

# Logistic Regression
# Binary target: 1 when mpg is at or above the median, else 0.
auto['mpg_high'] = (auto['mpg'] >= auto['mpg'].median()).astype(int)
# mpg_high is now the last column, so trim two trailing columns to get the
# same predictors as above.
X_cls = auto.iloc[:, 1:-2]
y_cls = auto['mpg_high']
# The classification split gets its own names so the regression split
# (X_train / y_train on mpg) stays available for ridge and lasso below.
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)

log_reg = LogisticRegression()
log_reg.fit(X_cls_train, y_cls_train)
y_pred = log_reg.predict(X_cls_test)
log_reg_acc = accuracy_score(y_cls_test, y_pred)
log_reg_coef = pd.DataFrame(log_reg.coef_, index=['Coefficient'], columns=X_cls.columns)
print('\nLogistic Regression Results:')
print('Accuracy:', log_reg_acc)
print(log_reg_coef)

# Ridge Regression
# Ridge and lasso are regressors, so they are fitted on the continuous mpg
# split; their MSE is then comparable with the linear-regression MSE.
alphas = np.logspace(-4, 4, 100)
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_test, y_pred)
ridge_coef = pd.DataFrame(ridge.coef_, index=X.columns, columns=['Coefficient'])
print('\nRidge Regression Results:')
print('MSE:', ridge_mse)
print(ridge_coef)

# Lasso Regression
lasso = LassoCV(alphas=alphas, cv=5)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_test, y_pred)
lasso_coef = pd.DataFrame(lasso.coef_, index=X.columns, columns=['Coefficient'])
print('\nLasso Regression Results:')
print('MSE:', lasso_mse)
print(lasso_coef)


# Variance inflation factor of each training predictor; the raw feature
# matrix is passed as-is (no intercept column is added).
vif = pd.DataFrame()
vif["features"] = X_train.columns
vif["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]

print(vif)

###################################

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score

# Fetch the Breast Cancer Wisconsin data bundled with scikit-learn
breast_cancer = load_breast_cancer()

# Hold out 20% of the samples for evaluation (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer.data,
    breast_cancer.target,
    random_state=42,
    test_size=0.2,
)

# Linear Regression: fit on the training split, score by test-set MSE
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_pred)
print("Linear Regression Mean Squared Error:", lr_mse)

# Logistic Regression: fit on the training split, score by test-set accuracy
logreg = LogisticRegression().fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)
logreg_acc = accuracy_score(y_true=y_test, y_pred=logreg_pred)
print("Logistic Regression Accuracy:", logreg_acc)

# Ridge Regression with a fixed penalty strength of 0.5
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_pred)
print("Ridge Regression Mean Squared Error:", ridge_mse)

# Lasso Regression with a fixed penalty strength of 0.5
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_pred)
print("Lasso Regression Mean Squared Error:", lasso_mse)

#######################################################

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import statsmodels.api as sm

# Load the Auto dataset; '?' entries are read as NaN and incomplete rows are dropped
auto = pd.read_csv('/content/Auto.csv', na_values='?').dropna()

# Split the data into training and testing sets (70/30, seeded).
# Predictors are every column except the first and the last one.
X_train, X_test, y_train, y_test = train_test_split(auto.iloc[:, 1:-1], auto.mpg, test_size=0.3, random_state=42)

# Linear regression using OLS
X_train_ols = sm.add_constant(X_train)  # add constant term to X_train
ols_model = sm.OLS(y_train, X_train_ols).fit()  # fit OLS model to training data
X_test_ols = sm.add_constant(X_test)  # add constant term to X_test
y_pred_ols = ols_model.predict(X_test_ols)  # make predictions on testing data
ols_mse = mean_squared_error(y_test, y_pred_ols)  # compute MSE
print("Linear Regression using OLS")
print(f"MSE: {ols_mse}")
print(ols_model.summary())

# Logistic regression
# mpg is continuous, which a classifier cannot take as its target, so the
# existing split is relabelled as 1 (mpg at or above the dataset median) or 0.
mpg_median = auto.mpg.median()
y_train_cls = (y_train >= mpg_median).astype(int)
y_test_cls = (y_test >= mpg_median).astype(int)

logreg = LogisticRegression()
logreg.fit(X_train, y_train_cls)
logreg_pred = logreg.predict(X_test)
logreg_acc = accuracy_score(y_test_cls, logreg_pred)
print("Logistic Regression Accuracy:", logreg_acc)
print(classification_report(y_test_cls, logreg_pred))

# Ridge regression using RidgeCV
params = {'alpha': np.logspace(-4, 4, 9)}
# RidgeCV expects the candidate alpha values themselves, not a parameter grid dict
ridge_cv = RidgeCV(alphas=params['alpha'], cv=5)  # perform cross-validation for hyperparameter tuning
ridge_cv.fit(X_train, y_train)  # fit cross-validation to training data
y_pred_ridge = ridge_cv.predict(X_test)  # make predictions on testing data
ridge_mse = mean_squared_error(y_test, y_pred_ridge)  # compute MSE
print("Ridge Regression using RidgeCV")
print(f"MSE: {ridge_mse}")
print(f"Alpha: {ridge_cv.alpha_}")
print(f"Coefficients: {ridge_cv.coef_}")

# Lasso regression using LassoCV
params = {'alpha': np.logspace(-4, 4, 9)}
# LassoCV likewise takes the array of candidate alphas directly
lasso_cv = LassoCV(alphas=params['alpha'], cv=5)  # perform cross-validation for hyperparameter tuning
lasso_cv.fit(X_train, y_train)  # fit cross-validation to training data
y_pred_lasso = lasso_cv.predict(X_test)  # make predictions on testing data
lasso_mse = mean_squared_error(y_test, y_pred_lasso)  # compute MSE
print("Lasso Regression using LassoCV")
print(f"MSE: {lasso_mse}")
print(f"Alpha: {lasso_cv.alpha_}")
print(f"Coefficients: {lasso_cv.coef_}")

#####################################

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, f_oneway
from statsmodels.multivariate.manova import MANOVA

# The two-way ANOVA below needs the statsmodels formula and stats APIs,
# which the imports of this section do not provide.
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load data
data = pd.read_csv('example_data.csv')

# Chi-square test of independence
# Hypothesis: there is a relationship between two categorical variables
cont_table = pd.crosstab(data['Variable1'], data['Variable2'])
chi2, p, dof, exp_freq = chi2_contingency(cont_table)
print(f"Chi-square test results:\nChi-square statistic: {chi2:.2f}\np-value: {p:.2f}")

# One-way ANOVA
# Hypothesis: there is a difference in means between multiple groups
group1 = data[data['Group'] == 'Group1']['Variable1']
group2 = data[data['Group'] == 'Group2']['Variable1']
group3 = data[data['Group'] == 'Group3']['Variable1']
f_stat, p = f_oneway(group1, group2, group3)
print(f"One-way ANOVA results:\nF-statistic: {f_stat:.2f}\np-value: {p:.2f}")

# Two-way ANOVA
# Hypothesis: there is a difference in means between multiple groups, considering two factors
# (assumes Group and Factor2 hold categorical labels — TODO confirm)
formula = 'Variable1 ~ Group + Factor2'
model = smf.ols(formula=formula, data=data).fit()
aov_table = sm.stats.anova_lm(model, typ=2)
print(f"Two-way ANOVA results:\n{aov_table}")

# MANOVA
# Hypothesis: there is a difference in means across multiple dependent variables
variables = ['Var1', 'Var2', 'Var3']
# Dependent variables are joined with '+' on the left-hand side of the
# formula, giving 'Var1 + Var2 + Var3 ~ Group'.
manova = MANOVA.from_formula(f'{" + ".join(variables)} ~ Group', data=data)
print(f"MANOVA results:\n{manova.mv_test()}")