In [None]:
import pandas as pd

# Read the concrete dataset CSV from its raw GitHub URL into a DataFrame.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rahulinchal/SPPU/refs/heads/main/Data/concrete_Data.csv'
)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Column the models are trained to predict.
target = 'strength'
# Every column except the last one is treated as a predictor.
features = list(df.columns[:-1])

In [3]:
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the column whose values are tested against the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the test, with their original index labels. Rows
        where ``column`` is NaN are dropped, because NaN fails the range test.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread
    # Series.between is inclusive on both ends by default, matching >= / <=.
    return df[values.between(lower_bound, upper_bound)]

# Apply the IQR filter one feature at a time. `features` leaves out the last
# column, so only predictors are screened; each pass works on the rows that
# survived the previous one.
df_no_outliers = df.copy()
for col in features:
    df_no_outliers = remove_outliers_iqr(df_no_outliers, column=col)

print("Shape of the dataset before outlier removal:", df.shape)
print("Shape of the dataset after outlier removal:", df_no_outliers.shape)

# From here on, `df` refers to the outlier-filtered data.
df = df_no_outliers

Shape of the dataset before outlier removal: (1030, 9)
Shape of the dataset after outlier removal: (930, 9)


In [4]:
# Check Skewness and Kurtosis
import numpy as np

# Skewness above this value is treated as "highly skewed".
SKEW_THRESHOLD = 0.5

# Start from the outlier-filtered frame every time this cell runs, so that
# re-running it cannot apply the log transform twice to the same columns.
df = df_no_outliers.copy()

skew_before = df[features].skew()
print("\nSkewness of features:")
print(skew_before)
print("\nKurtosis of features:")
print(df[features].kurt())

# Apply Log Transformation to features with high positive skewness (e.g., age).
# The selection uses the signed skewness rather than its absolute value:
# log1p compresses a long right tail, so it is only appropriate for
# right-skewed features and would make a left-skewed one worse.
highly_skewed_features = skew_before[skew_before > SKEW_THRESHOLD].index

for feature in highly_skewed_features:
    df[feature] = np.log1p(df[feature])

print("\nSkewness of features after transformation:")
print(df[features].skew())
print()
print(df[features].kurt())



Skewness of features:
cement          0.544754
slag            0.767062
ash             0.400773
water          -0.089035
superplastic    0.266093
coarseagg      -0.128488
fineagg        -0.182324
age             1.249826
dtype: float64

Kurtosis of features:
cement         -0.447356
slag           -0.659990
ash            -1.438439
water          -0.386411
superplastic   -0.709014
coarseagg      -0.598176
fineagg        -0.090915
age             0.634802
dtype: float64

Skewness of features after transformation:
cement         -0.093609
slag           -0.001360
ash             0.400773
water          -0.089035
superplastic    0.266093
coarseagg      -0.128488
fineagg        -0.182324
age            -0.393418
dtype: float64

cement         -0.859936
slag           -1.881326
ash            -1.438439
water          -0.386411
superplastic   -0.709014
coarseagg      -0.598176
fineagg        -0.090915
age            -0.720630
dtype: float64


In [5]:
# Separate the predictors from the target column.
X = df.drop(columns=target)
y = df[target]

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# (and therefore every downstream fit and displayed number) repeatable across
# kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (744, 8)
Shape of X_test: (186, 8)
Shape of y_train: (744,)
Shape of y_test: (186,)


In [6]:
# Fit a linear regression on the training split as-is (no standardization)
# and predict the held-out rows.
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same linear model, this time trained on the standardized features.
scaled = LinearRegression().fit(X_train_scaled, y_train)
y_pred_scaled = scaled.predict(X_test_scaled)

In [8]:
# Side-by-side table of actual vs. predicted strength for the scaled model,
# plus the residual (actual minus predicted) for each test row.
diff_scaled = (
    pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_scaled})
    .assign(Difference=lambda frame: frame['Actual'] - frame['Predicted'])
)
print("\nDifference between Actual and Predicted (Linear Regression - Scaled):")
diff_scaled


Difference between Actual and Predicted (Linear Regression - Scaled):


Unnamed: 0,Actual,Predicted,Difference
905,40.06,32.307015,7.752985
659,12.73,13.377696,-0.647696
141,30.39,27.576596,2.813404
71,50.46,46.359732,4.100268
392,49.20,47.612947,1.587053
...,...,...,...
351,17.37,15.900672,1.469328
389,32.76,31.787885,0.972115
470,12.45,6.371933,6.078067
205,24.29,17.473385,6.816615


In [9]:
# NOTE(review): every line in this cell is commented out, so the visible
# notebook never reports an R² score for either model even though r2_score is
# imported. Uncomment to compare train/test R² for the scaled and unscaled fits.
# # Predictions
# y_train_pred_scaled = scaled.predict(X_train_scaled)
# y_test_pred_scaled = scaled.predict(X_test_scaled)

# # R² Scores
# r2_train = r2_score(y_train, y_train_pred_scaled)
# r2_test = r2_score(y_test, y_test_pred_scaled)

# print(f"R² Score (Train, Scaled): {r2_train:.5f}")
# print(f"R² Score (Test, Scaled): {r2_test:.5f}")
# print(f'{abs(r2_train - r2_test):.5f}')

# y_train_pred = linear_reg.predict(X_train)
# y_test_pred = linear_reg.predict(X_test)

# r2_train_unscaled = r2_score(y_train, y_train_pred)
# r2_test_unscaled = r2_score(y_test, y_test_pred)

# print(f"R² Score (Train, Unscaled): {r2_train_unscaled:.5f}")
# print(f"R² Score (Test, Unscaled): {r2_test_unscaled:.5f}")
# print(f'{abs(r2_train_unscaled - r2_test_unscaled):.5f}')

In [10]:
# Display the working frame: outliers removed and skewed features log1p-transformed.
df

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,4.957938,5.361292,0.0,203.5,0.0,971.8,748.5,3.367296,29.89
1,5.135210,3.765840,124.3,158.3,10.8,1080.8,796.2,2.708050,23.51
2,5.525453,0.000000,95.7,187.4,5.5,956.9,861.2,3.367296,29.22
3,5.587249,4.744932,0.0,228.0,0.0,932.0,670.0,3.367296,45.85
4,5.048573,5.217107,0.0,193.3,9.1,1047.4,696.7,3.367296,18.29
...,...,...,...,...,...,...,...,...,...
1023,4.957938,5.361292,0.0,203.5,0.0,971.8,748.5,2.079442,10.39
1024,5.697764,0.000000,117.5,174.8,9.5,1022.8,753.5,1.386294,21.91
1025,4.912655,0.000000,166.0,180.0,10.0,961.0,805.0,3.367296,13.29
1027,5.625461,4.762174,90.3,179.6,8.9,870.1,768.3,3.367296,44.28
