In [3]:
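# NOTE: This entire cell is a commented-out earlier draft of the pipeline that the
# cells below run step by step. It executes nothing and is kept for reference only.
# import pandas as pd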
# import numpy as np
# import sklearn
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# # We are adding mean_absolute_error and r2_score
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# # 1. Create a Sample Dataset
# # We'll create a DataFrame that simulates real-world data with missing values,
# # a categorical feature, and a value that needs to be replaced.
# print("--- 1. Creating a Sample Dataset ---")
# data = {
#     'feature1': [10, 20, 30, np.nan, 50, 60, 70, 80, 90, 100],
#     'feature2': [15, 25, 35, 45, 55, 65, 75, np.nan, 95, 105],
#     'feature_cat': ['A', 'B', 'A', 'C', 'B', 'B', 'A', 'C', 'A', 'B'],
#     'garbage_val': [1, 2, -999, 4, 5, 6, 7, 8, -999, 10],
#     'target': [110, 125, 140, 155, 170, 185, 200, 215, 230, 245]
# }
# df = pd.DataFrame(data)
# print("Original DataFrame:")
# print(df)
# print("\n" + "="*50 + "\n")

# # 2. Data Preprocessing Steps

# # a) Finding and Handling Null Values
# print("--- 2a. Handling Null Values ---")
# print("Null values before processing:")
# print(df.isnull().sum())

# # Replace null values in 'feature1' with the mean.
# # We use the mean for numerical data to preserve the distribution.
# df['feature1'].fillna(df['feature1'].mean(), inplace=True)

# # Replace null values in 'feature2' with the median.
# # The median is a good choice if there are potential outliers.
# df['feature2'].fillna(df['feature2'].median(), inplace=True)

# print("\nNull values after processing:")
# print(df.isnull().sum())
# print("\n" + "="*50 + "\n")

# # b) Replacing Garbage Values
# print("--- 2b. Replacing Garbage Values ---")
# # Let's assume -999 is a garbage value that should be replaced with the mode.
# # The mode is useful for categorical or discrete data.
# # First, let's get the mode of the 'garbage_val' column, excluding the -999.
# clean_mode = df[df['garbage_val'] != -999]['garbage_val'].mode()[0]
# df['garbage_val'] = df['garbage_val'].replace(-999, clean_mode)
# print("DataFrame after replacing garbage values:")
# print(df)
# print("\n" + "="*50 + "\n")

# # c) Feature Engineering
# print("--- 2c. Feature Engineering ---")
# # Creating a new feature by combining existing ones.
# df['combined_feature'] = df['feature1'] + df['feature2']
# print("DataFrame after adding 'combined_feature':")
# print(df)
# print("\n" + "="*50 + "\n")

# # d) One-Hot Encoding for Categorical Data
# print("--- 2d. One-Hot Encoding Categorical Data ---")
# # 'feature_cat' is a categorical column. Linear regression requires numerical data,
# # so we'll convert it using one-hot encoding.
# df = pd.get_dummies(df, columns=['feature_cat'], drop_first=True)
# print("DataFrame after one-hot encoding 'feature_cat':")
# print(df)
# print("\n" + "="*50 + "\n")

# # 3. Feature Standardization and Normalization

# # Standardization (using StandardScaler)
# # It scales features to have a mean of 0 and a standard deviation of 1.
# # Useful for algorithms that assume a normal distribution.
# print("--- 3. Feature Scaling ---")
# scaler = StandardScaler()
# df[['feature1', 'feature2']] = scaler.fit_transform(df[['feature1', 'feature2']])
# print("DataFrame after standardizing 'feature1' and 'feature2':")
# print(df)
# print("\n" + "="*50 + "\n")

# # Normalization (using MinMaxScaler)
# # It scales features to a fixed range, usually 0 to 1.
# # Useful for algorithms that don't assume a normal distribution, like neural networks.
# print("\nChecking columns before normalization:")
# print(df.columns) # Added for debugging to check if 'combined_feature' exists
# minmax_scaler = MinMaxScaler()
# df[['combined_feature']] = minmax_scaler.fit_transform(df[['combined_feature']])
# print("DataFrame after normalizing 'combined_feature':")
# print(df)
# print("\n" + "="*50 + "\n")

# # 4. Applying Linear Regression

# # Define features (X) and target (y)
# X = df.drop('target', axis=1)
# y = df['target']

# # Split the data into training and testing sets
# # We use a 70/30 split, with 70% for training and 30% for testing.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# print("Data split complete. X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
# print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)
# print("\n" + "="*50 + "\n")

# # Initialize and train the Linear Regression model
# print("--- 4. Training the Linear Regression Model ---")
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Make predictions on the test set
# predictions = model.predict(X_test)
# print("Predictions on the test set:")
# print(predictions)

# # Evaluate the model
# mse = mean_squared_error(y_test, predictions)
# mae = mean_absolute_error(y_test, predictions)
# r2 = r2_score(y_test, predictions)

# print(f"\nMean Squared Error (MSE) on the test set: {mse:.2f}")
# print(f"Mean Absolute Error (MAE) on the test set: {mae:.2f}")
# print(f"R-squared score on the test set: {r2:.2f}")


In [4]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# We are adding mean_absolute_error and r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Create a Sample Dataset
# Build a small synthetic frame that mimics messy real-world input: one NaN in
# each numeric feature, a three-level categorical column, and a numeric column
# that uses -999 as a placeholder for bad readings.
print("--- 1. Creating a Sample Dataset ---")
data = dict(
    feature1=[10, 20, 30, np.nan, 50, 60, 70, 80, 90, 100],
    feature2=[15, 25, 35, 45, 55, 65, 75, np.nan, 95, 105],
    feature_cat=['A', 'B', 'A', 'C', 'B', 'B', 'A', 'C', 'A', 'B'],
    garbage_val=[1, 2, -999, 4, 5, 6, 7, 8, -999, 10],
    target=[110, 125, 140, 155, 170, 185, 200, 215, 230, 245],
)
df = pd.DataFrame(data)
# Header and frame are emitted on separate lines, exactly as two print calls would.
print("Original DataFrame:", df, sep="\n")
print("\n" + "=" * 60 + "\n")

--- 1. Creating a Sample Dataset ---
Original DataFrame:
   feature1  feature2 feature_cat  garbage_val  target
0      10.0      15.0           A            1     110
1      20.0      25.0           B            2     125
2      30.0      35.0           A         -999     140
3       NaN      45.0           C            4     155
4      50.0      55.0           B            5     170
5      60.0      65.0           B            6     185
6      70.0      75.0           A            7     200
7      80.0       NaN           C            8     215
8      90.0      95.0           A         -999     230
9     100.0     105.0           B           10     245




In [5]:
# 2. Data Preprocessing Steps
#
# a) Finding and Handling Null Values
# Count the missing entries per column before any imputation is applied, so the
# effect of the following cells can be checked against this baseline.
null_counts_before = df.isna().sum()
print("--- 2a. Handling Null Values ---", "Null values before processing:", sep="\n")
print(null_counts_before)

--- 2a. Handling Null Values ---
Null values before processing:
feature1       1
feature2       1
feature_cat    0
garbage_val    0
target         0
dtype: int64


In [7]:
# Fill the missing 'feature1' entries with the column mean.
# Mean imputation leaves the column average unchanged (it does not preserve the
# spread of the values, since every filled row sits exactly at the centre).
# NOTE(review): the mean is taken over all rows, i.e. before the train/test
# split that happens in a later cell.
feature1_mean = df['feature1'].mean()
df['feature1'] = df['feature1'].fillna(feature1_mean)

In [8]:
# Fill the missing 'feature2' entries with the column median.
# The median is less sensitive to extreme values than the mean, which makes it
# the safer default when outliers may be present.
feature2_median = df['feature2'].median()
df['feature2'] = df['feature2'].fillna(feature2_median)

In [9]:
# Re-count the missing entries per column to confirm the imputation above
# left no NaN behind.
null_counts_after = df.isna().sum()
print("\nNull values after processing:")
print(null_counts_after)
print("\n" + "=" * 50 + "\n")


Null values after processing:
feature1       0
feature2       0
feature_cat    0
garbage_val    0
target         0
dtype: int64




In [8]:
# b) Replacing Garbage Values
print("--- 2b. Replacing Garbage Values ---")
# -999 is treated as a placeholder for an invalid reading in 'garbage_val'.
# Naming it once keeps the filter and the replacement in sync.
GARBAGE_SENTINEL = -999

# Compute the mode from the valid entries only, so the placeholder itself can
# never win. A single .loc selection replaces the chained df[mask]['col'] lookup.
valid_values = df.loc[df['garbage_val'] != GARBAGE_SENTINEL, 'garbage_val']
modes = valid_values.mode()
if modes.empty:
    # Without this guard the positional lookup below fails with an opaque error
    # when every entry is the placeholder.
    raise ValueError("'garbage_val' contains no valid entries to take a mode from")

# Series.mode() returns all tied values in sorted order; in this sample every
# valid value occurs exactly once, so the first entry is simply the smallest one.
clean_mode = modes.iloc[0]
df['garbage_val'] = df['garbage_val'].replace(GARBAGE_SENTINEL, clean_mode)
print("DataFrame after replacing garbage values:")
print(df)
print("\n" + "="*50 + "\n")

--- 2b. Replacing Garbage Values ---
DataFrame after replacing garbage values:
     feature1  feature2 feature_cat  garbage_val  target
0   10.000000      15.0           A            1     110
1   20.000000      25.0           B            2     125
2   30.000000      35.0           A            1     140
3   56.666667      45.0           C            4     155
4   50.000000      55.0           B            5     170
5   60.000000      65.0           B            6     185
6   70.000000      75.0           A            7     200
7   80.000000      55.0           C            8     215
8   90.000000      95.0           A            1     230
9  100.000000     105.0           B           10     245




In [9]:
# c) Feature Engineering
print("--- 2c. Feature Engineering ---")
# Derive a new column as the element-wise sum of the two numeric features.
# This cell must run before the scaling cells: they overwrite 'feature1' and
# 'feature2' in place, so re-running it afterwards would sum scaled values.
# NOTE(review): the new column is an exact linear combination of two existing
# features, so it carries no independent information for a linear model.
df['combined_feature'] = df['feature1'].add(df['feature2'])
print("DataFrame after adding 'combined_feature':", df, sep="\n")
print("\n" + "=" * 50 + "\n")

--- 2c. Feature Engineering ---
DataFrame after adding 'combined_feature':
     feature1  feature2 feature_cat  garbage_val  target  combined_feature
0   10.000000      15.0           A            1     110         25.000000
1   20.000000      25.0           B            2     125         45.000000
2   30.000000      35.0           A            1     140         65.000000
3   56.666667      45.0           C            4     155        101.666667
4   50.000000      55.0           B            5     170        105.000000
5   60.000000      65.0           B            6     185        125.000000
6   70.000000      75.0           A            7     200        145.000000
7   80.000000      55.0           C            8     215        135.000000
8   90.000000      95.0           A            1     230        185.000000
9  100.000000     105.0           B           10     245        205.000000




In [10]:
# d) One-Hot Encoding for Categorical Data
print("--- 2d. One-Hot Encoding Categorical Data ---")
# 'feature_cat' holds string labels; linear regression needs numeric inputs, so
# the column is expanded into indicator columns. drop_first=True omits the first
# category, which then acts as the baseline level.
#
# The guard makes the cell safe to re-run: after the first execution `df` no
# longer has a 'feature_cat' column, and calling get_dummies on it again would
# raise a KeyError.
if 'feature_cat' in df.columns:
    df = pd.get_dummies(df, columns=['feature_cat'], drop_first=True)
print("DataFrame after one-hot encoding 'feature_cat':")
print(df)
print("\n" + "="*50 + "\n")

--- 2d. One-Hot Encoding Categorical Data ---
DataFrame after one-hot encoding 'feature_cat':
     feature1  feature2  garbage_val  target  combined_feature  feature_cat_B  \
0   10.000000      15.0            1     110         25.000000          False   
1   20.000000      25.0            2     125         45.000000           True   
2   30.000000      35.0            1     140         65.000000          False   
3   56.666667      45.0            4     155        101.666667          False   
4   50.000000      55.0            5     170        105.000000           True   
5   60.000000      65.0            6     185        125.000000           True   
6   70.000000      75.0            7     200        145.000000          False   
7   80.000000      55.0            8     215        135.000000          False   
8   90.000000      95.0            1     230        185.000000          False   
9  100.000000     105.0           10     245        205.000000           True   

   feature_cat

In [11]:
# 3. Feature Standardization and Normalization

# Standardization (using StandardScaler)
# Rescales each selected column to zero mean and unit standard deviation, which
# puts features measured on different scales on a comparable footing.
# NOTE(review): the scaler is fitted on every row of `df`, before the train/test
# split done in a later cell, so test rows contribute to the scaling statistics.
print("--- 3. Feature Scaling ---")
standardized_cols = ['feature1', 'feature2']
scaler = StandardScaler()
scaler.fit(df[standardized_cols])
df[standardized_cols] = scaler.transform(df[standardized_cols])
print("DataFrame after standardizing 'feature1' and 'feature2':", df, sep="\n")
print("\n" + "=" * 50 + "\n")

--- 3. Feature Scaling ---
DataFrame after standardizing 'feature1' and 'feature2':
   feature1  feature2  garbage_val  target  combined_feature  feature_cat_B  \
0 -1.649916 -1.527525            1     110         25.000000          False   
1 -1.296362 -1.163829            2     125         45.000000           True   
2 -0.942809 -0.800132            1     140         65.000000          False   
3  0.000000 -0.436436            4     155        101.666667          False   
4 -0.235702 -0.072739            5     170        105.000000           True   
5  0.117851  0.290957            6     185        125.000000           True   
6  0.471405  0.654654            7     200        145.000000          False   
7  0.824958 -0.072739            8     215        135.000000          False   
8  1.178511  1.382047            1     230        185.000000          False   
9  1.532065  1.745743           10     245        205.000000           True   

   feature_cat_C  
0          False  
1       

In [12]:
# Normalization (using MinMaxScaler)
# Rescales the selected column linearly onto a fixed range (0 to 1 with the
# default settings), keeping the relative spacing of the values.
print("\nChecking columns before normalization:")
# Debug aid: confirms 'combined_feature' is present before it is rescaled.
print(df.columns)
normalized_cols = ['combined_feature']
minmax_scaler = MinMaxScaler()
df[normalized_cols] = minmax_scaler.fit_transform(df[normalized_cols])
print("DataFrame after normalizing 'combined_feature':", df, sep="\n")
print("\n" + "=" * 50 + "\n")


Checking columns before normalization:
Index(['feature1', 'feature2', 'garbage_val', 'target', 'combined_feature',
       'feature_cat_B', 'feature_cat_C'],
      dtype='object')
DataFrame after normalizing 'combined_feature':
   feature1  feature2  garbage_val  target  combined_feature  feature_cat_B  \
0 -1.649916 -1.527525            1     110          0.000000          False   
1 -1.296362 -1.163829            2     125          0.111111           True   
2 -0.942809 -0.800132            1     140          0.222222          False   
3  0.000000 -0.436436            4     155          0.425926          False   
4 -0.235702 -0.072739            5     170          0.444444           True   
5  0.117851  0.290957            6     185          0.555556           True   
6  0.471405  0.654654            7     200          0.666667          False   
7  0.824958 -0.072739            8     215          0.611111          False   
8  1.178511  1.382047            1     230          0.888889 

In [13]:
# 4. Applying Linear Regression

# Separate the response from the predictors: `y` is the 'target' column and `X`
# is every other column of the preprocessed frame.
y = df['target']
X = df.drop(columns='target')

In [14]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"Data split complete. X_train shape: {X_train.shape} y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape} y_test shape: {y_test.shape}")
print("\n" + "=" * 50 + "\n")

Data split complete. X_train shape: (7, 6) y_train shape: (7,)
X_test shape: (3, 6) y_test shape: (3,)




In [15]:
# Fit an ordinary least-squares linear regression on the training rows.
print("--- 4. Training the Linear Regression Model ---")
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so its repr is shown.
model

--- 4. Training the Linear Regression Model ---


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
# Score the held-out rows with the fitted model and show the raw predictions.
predictions = model.predict(X_test)
print("Predictions on the test set:", predictions, sep="\n")

Predictions on the test set:
[230. 125. 185.]


In [18]:
# Evaluate the model on the held-out rows with three standard regression metrics:
# mean squared error, mean absolute error and the coefficient of determination.
# NOTE(review): the test set here is only a handful of rows of synthetic data,
# so these numbers illustrate the API rather than real predictive quality.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

report_lines = [
    f"\nMean Squared Error (MSE) on the test set: {mse:.2f}",
    f"Mean Absolute Error (MAE) on the test set: {mae:.2f}",
    f"R-squared score on the test set: {r2:.2f}",
]
print("\n".join(report_lines))


Mean Squared Error (MSE) on the test set: 0.00
Mean Absolute Error (MAE) on the test set: 0.00
R-squared score on the test set: 1.00
