In [9]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, kstest

# Load the combined invoice export.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a configurable data directory.
DATA_PATH = 'C:/Users/Antino/Desktop/al_ml_tranning_module_one/Garima_Rahul_code/invoices/combined_file.csv'
data = pd.read_csv(
    DATA_PATH,
    # NOTE(review): 'column_name1' / 'column_name2' look like template
    # placeholders rather than real columns of this file - confirm and replace
    # with the actual column dtypes (or drop the argument).
    dtype={'column_name1': str, 'column_name2': float},
    low_memory=False,
)

# Numerical columns that take part in the PCA.
numerical_cols = ['SubTotal', 'Total', 'Balance', 'Adjustment', 'Discount Amount', 'Item Total', 'Item Price']
numerical_data = data[numerical_cols]

# Standardize each column (zero mean, unit variance) so that columns measured
# on larger scales do not dominate the principal components.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(numerical_data)

# Fit PCA keeping every component and project the data onto them.
pca = PCA()
principal_components = pca.fit_transform(normalized_data)

# Share of the total variance captured by each component.
variance_explained = pca.explained_variance_ratio_

# Loadings: rows are the original variables, columns are the components.
# The labels are derived from the fitted model instead of a hard-coded
# PC1..PC7 list, so the frame stays valid when `numerical_cols` changes or
# PCA returns fewer components than there are features.
pc_labels = [f"PC{i + 1}" for i in range(pca.components_.shape[0])]
loadings = pd.DataFrame(pca.components_.T, columns=pc_labels, index=numerical_cols)

# Print the variance explained by each principal component
print("Variance explained by each principal component:")
for i, explained_var in enumerate(variance_explained):
    print(f"PC{i+1}: {explained_var:.4f}")

# Print the loadings of the original variables on each principal component
print("\nLoadings of original variables on each principal component:")
print(loadings)


Variance explained by each principal component:
PC1: 0.3219
PC2: 0.1435
PC3: 0.1429
PC4: 0.1426
PC5: 0.1420
PC6: 0.1072
PC7: 0.0000

Loadings of original variables on each principal component:
                      PC1       PC2       PC3       PC4       PC5       PC6  \
SubTotal         0.644301  0.047883  0.005855 -0.036876 -0.042168 -0.280636   
Total            0.642771  0.024525  0.014220 -0.036028 -0.086822 -0.281363   
Balance          0.024534 -0.505534  0.332537 -0.731767  0.311569  0.026347   
Adjustment      -0.062881 -0.458796  0.163125  0.018459 -0.870228  0.000167   
Discount Amount -0.000198  0.194211  0.924412  0.318918  0.077670 -0.001608   
Item Total       0.405330 -0.062985 -0.014531  0.084535  0.003162  0.907950   
Item Price       0.053598 -0.699545 -0.088450  0.593854  0.360928 -0.130419   

                          PC7  
SubTotal         7.075693e-01  
Total           -7.057201e-01  
Balance          5.227139e-17  
Adjustment       3.612270e-02  
Discount Amoun

In [10]:
# Single seed shared by the split and the tree so the reported MSE is
# reproducible from one run to the next.
RANDOM_STATE = 42

# Select the relevant numerical columns as input features
input_cols = ['SubTotal', 'Balance', 'Adjustment', 'Discount Amount', 'Item Total', 'Item Price']

# Define the target variable
# NOTE(review): the recorded correlation output shows SubTotal and Total at
# r ~= 0.9987, so SubTotal is close to a proxy for the target. A low MSE here
# may say little about the model itself - confirm this feature set is intended.
target_col = 'Total'

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    data[input_cols],
    data[target_col],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Create a CART regression model.
# Without a fixed random_state the tree shuffles candidate features at every
# split, so repeated runs can build different trees and report different
# errors even on an identical train/test split.
cart = DecisionTreeRegressor(random_state=RANDOM_STATE)

# Fit the model to the training data
cart.fit(X_train, y_train)

# Make predictions on the test data
y_pred = cart.predict(X_test)

# Evaluate the model using Mean Squared Error (MSE); the value is in squared
# units of the target column.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)


Mean Squared Error (MSE): 0.041996587581876924


In [11]:
# Select the relevant numerical columns for correlation test
correlation_cols = ['SubTotal', 'Total', 'Balance', 'Adjustment', 'Discount Amount', 'Item Total', 'Item Price']

# Perform Pearson correlation test for each unordered pair of variables.
# Slicing from i+1 visits every pair exactly once and skips self-pairs.
for i, col1 in enumerate(correlation_cols):
    for col2 in correlation_cols[i+1:]:
        correlation, p_value = pearsonr(data[col1], data[col2])
        print(f"\nCorrelation between {col1} and {col2}:")
        print("Correlation coefficient:", correlation)
        print("P-value:", p_value)

# Select the relevant numerical columns for KS test
ks_cols = ['SubTotal', 'Total', 'Balance', 'Adjustment', 'Discount Amount', 'Item Total', 'Item Price']

# Perform KS test (one-tailed and two-tailed) for each variable.
# kstest(..., 'norm') with no parameters compares against the *standard*
# normal N(0, 1). Raw columns are not on that scale, so that comparison
# measures location/scale rather than shape. The reference distribution is
# therefore given each column's own mean and standard deviation via `args`.
# NOTE: the parameters are estimated from the same sample being tested, so the
# p-values are approximate (no Lilliefors-style correction is applied).
for col in ks_cols:
    values = data[col]
    norm_params = (values.mean(), values.std())  # (loc, scale) for scipy's 'norm'
    _, one_tailed_p_value = kstest(values, 'norm', args=norm_params, alternative='greater')
    _, two_tailed_p_value = kstest(values, 'norm', args=norm_params, alternative='two-sided')
    print(f"\nKS Test for {col}:")
    print("One-tailed p-value:", one_tailed_p_value)
    print("Two-tailed p-value:", two_tailed_p_value)



Correlation between SubTotal and Total:
Correlation coefficient: 0.9986968647222841
P-value: 0.0

Correlation between SubTotal and Balance:
Correlation coefficient: 0.021573810997751786
P-value: 1.2536662935087152e-102

Correlation between SubTotal and Adjustment:
Correlation coefficient: -0.07664934791059529
P-value: 0.0

Correlation between SubTotal and Discount Amount:
Correlation coefficient: -0.00018533633831994254
P-value: 0.8534117235453929

Correlation between SubTotal and Item Total:
Correlation coefficient: 0.39098622791678345
P-value: 0.0

Correlation between SubTotal and Item Price:
Correlation coefficient: 0.034131292489575485
P-value: 6.45663048109556e-254

Correlation between Total and Balance:
Correlation coefficient: 0.021670574209987196
P-value: 1.558661576641308e-103

Correlation between Total and Adjustment:
Correlation coefficient: -0.025664591309085077
P-value: 1.972402620559538e-144

Correlation between Total and Discount Amount:
Correlation coefficient: -0.0001