In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [2]:
# Load the white-wine quality data from S3; fields in this file are separated by ';'.
df = pd.read_csv(
    's3://p4-aparanji/data-input-folder/winequality-white.csv',
    sep=";",
)

In [3]:
# Display the loaded frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [5]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
# Columns that represent the target and must never be used as model inputs:
# the original 'quality' score and the 'quality_encoded' column added below.
target_columns = ['quality', 'quality_encoded']

# Every other column is assumed to be a continuous feature. 'quality_encoded'
# is excluded explicitly so that re-running this cell (after the column already
# exists on df) cannot pull the encoded target into the feature list.
# Index.difference returns the column names in sorted order.
continuous_features = df.columns.difference(target_columns).tolist()

# Bin the numeric score into three classes:
#   (0, 4] -> 'Low', (4, 7] -> 'Average', (7, 10] -> 'High'.
# Only bin while the column is still numeric, so a second run of this cell
# does not try to bin the labels produced by the first run.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = pd.cut(df['quality'], bins=[0, 4, 7, 10], labels=['Low', 'Average', 'High'])

# Encode the class labels as integers for the classifiers.
label_encoder = LabelEncoder()
df['quality_encoded'] = label_encoder.fit_transform(df['quality'])

# Column transformer that standardizes the continuous features.
# Columns not listed here are dropped (ColumnTransformer's default remainder).
pipeline = ColumnTransformer([
    ('scaler', StandardScaler(), continuous_features)  # Apply standardization
    # Add other transformers here if needed
])


In [7]:
# Split data into features and target.
# Both target representations are removed from X: 'quality' holds the binned
# label and 'quality_encoded' is the integer target itself, so neither may
# travel with the feature matrix.
X = df.drop(columns=['quality', 'quality_encoded'])
y = df['quality_encoded']

In [8]:
# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets. Fixed random_state values keep both splits reproducible.
# NOTE(review): the splits are not stratified on y — confirm this is intended.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_validate, X_test, y_validate, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


def _as_feature_frame(values):
    """Wrap a transformed array in a DataFrame labelled with the feature names."""
    return pd.DataFrame(values, columns=continuous_features)


# The transformer is fitted on the training split only and then re-used,
# already fitted, on the validation and test splits.
X_train_transformed = _as_feature_frame(pipeline.fit_transform(X_train))
X_validate_transformed = _as_feature_frame(pipeline.transform(X_validate))
X_test_transformed = _as_feature_frame(pipeline.transform(X_test))

# Write the scaled features and the matching labels to CSV, without the index.
csv_outputs = {
    'train_features.csv': X_train_transformed,
    'validate_features.csv': X_validate_transformed,
    'test_features.csv': X_test_transformed,
    'train_labels.csv': y_train,
    'validate_labels.csv': y_validate,
    'test_labels.csv': y_test,
}
for csv_name, data in csv_outputs.items():
    data.to_csv(csv_name, index=False)

In [9]:
# Create and fit both ensemble classifiers on the scaled training split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_transformed, y_train)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_transformed, y_train)

# Predict class codes for the test split with each fitted model.
rf_predictions = rf_model.predict(X_test_transformed)
gb_predictions = gb_model.predict(X_test_transformed)

# Evaluate the models
def evaluate_model(predictions, y_test):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    predictions : predicted class labels.
    y_test : true class labels for the same samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``; precision and recall are
        macro-averaged across classes.
    """
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='macro'),
        recall_score(y_test, predictions, average='macro'),
    )

# Score both models on the test split; the individual values are kept as
# separate names because the summary tables later in the notebook use them.
rf_scores = evaluate_model(rf_predictions, y_test)
gb_scores = evaluate_model(gb_predictions, y_test)
rf_accuracy, rf_precision, rf_recall = rf_scores
gb_accuracy, gb_precision, gb_recall = gb_scores

# One format template per metric, in the order evaluate_model returns them.
metric_templates = ("Accuracy: {:.2f}", "Precision: {:.2f}", "Recall: {:.2f}")
for heading, scores in (
    ("Random Forest Metrics:", rf_scores),
    ("\nGradient Boosting Machine Metrics:", gb_scores),
):
    print(heading)
    for template, score in zip(metric_templates, scores):
        print(template.format(score))

Random Forest Metrics:
Accuracy: 0.96
Precision: 0.83
Recall: 0.63

Gradient Boosting Machine Metrics:
Accuracy: 0.94
Precision: 0.68
Recall: 0.47


In [10]:
# Score the gradient boosting model on the validation split.
validate_prediction_gb = gb_model.predict(X_validate_transformed)
v_gb_accuracy, v_gb_precision, v_gb_recall = evaluate_model(validate_prediction_gb, y_validate)

print("Gradient Boost Metrics for validate data:")
for template, score in zip(
    ("Accuracy: {:.2f}", "Precision: {:.2f}", "Recall: {:.2f}"),
    (v_gb_accuracy, v_gb_precision, v_gb_recall),
):
    print(template.format(score))

Gradient Boost Metrics for validate data:
Accuracy: 0.94
Precision: 0.80
Recall: 0.49


In [11]:
# Score the random forest model on the validation split.
validate_prediction_rf = rf_model.predict(X_validate_transformed)
v_rf_accuracy, v_rf_precision, v_rf_recall = evaluate_model(validate_prediction_rf, y_validate)

print("Random Forest Metrics for validate data:")
for template, score in zip(
    ("Accuracy: {:.2f}", "Precision: {:.2f}", "Recall: {:.2f}"),
    (v_rf_accuracy, v_rf_precision, v_rf_recall),
):
    print(template.format(score))

Random Forest Metrics for validate data:
Accuracy: 0.95
Precision: 0.93
Recall: 0.51


In [12]:
# Mapping from encoded class code to a display name.
# It is built from the fitted encoder instead of being hard-coded: the integer
# code of each label is its position in label_encoder.classes_, so deriving
# the mapping here keeps it in step with the encoding used for the training
# labels even if the bin labels are changed later. For the current labels this
# yields {0: 'Average Quality', 1: 'High Quality', 2: 'Low Quality'}.
quality_mapping = {
    code: "{} Quality".format(label)
    for code, label in enumerate(label_encoder.classes_)
}

# Translate the random forest's validation predictions into display names.
validate_prediction_mapped_rf = [quality_mapping[prediction] for prediction in validate_prediction_rf]

# Single-column frame holding the named predictions.
df_prediction_rf = pd.DataFrame(validate_prediction_mapped_rf, columns=['Predicted Quality'])

# Save as CSV without the index column.
df_prediction_rf.to_csv('predicted_quality_rf.csv', index=False)


In [13]:

# Translate the gradient boosting validation predictions into display names
# using the code -> name mapping defined in the previous cell.
validate_prediction_mapped_gb = [
    quality_mapping[code] for code in validate_prediction_gb
]

# Single-column frame holding the named predictions.
df_prediction_gb = pd.DataFrame({'Predicted Quality': validate_prediction_mapped_gb})

# Save as CSV without the index column.
df_prediction_gb.to_csv('predicted_quality_gb.csv', index=False)

In [18]:
!pip install prettytable

Collecting prettytable
  Downloading prettytable-3.10.0-py3-none-any.whl.metadata (30 kB)
Downloading prettytable-3.10.0-py3-none-any.whl (28 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.10.0


In [14]:
from prettytable import PrettyTable
import numpy as np
def _build_metrics_table(test_scores, validation_scores):
    """Build a table comparing test and validation scores.

    Both arguments are (accuracy, precision, recall) sequences; every value is
    rounded to two decimals before being added to the table.
    """
    table = PrettyTable()
    table.field_names = ["Performance Metric", "Testing Set", "Validation Set"]
    metric_names = ("Accuracy", "Precision", "Recall")
    for metric, test_value, validation_value in zip(metric_names, test_scores, validation_scores):
        table.add_row([metric, np.round(test_value, 2), np.round(validation_value, 2)])
    return table


rf_table = _build_metrics_table(
    (rf_accuracy, rf_precision, rf_recall),
    (v_rf_accuracy, v_rf_precision, v_rf_recall),
)
gb_table = _build_metrics_table(
    (gb_accuracy, gb_precision, gb_recall),
    (v_gb_accuracy, v_gb_precision, v_gb_recall),
)

print("Performance Metric for random forest Classifier:")
print(rf_table)
print("\nPerformance Metric for gradient boost Classifier:")
print(gb_table)

Performance Metric for random forest Classifier:
+--------------------+-------------+----------------+
| Performance Metric | Testing Set | Validation Set |
+--------------------+-------------+----------------+
|      Accuracy      |     0.96    |      0.95      |
|     Precision      |     0.83    |      0.93      |
|       Recall       |     0.63    |      0.51      |
+--------------------+-------------+----------------+

Performance Metric for gradient boost Classifier:
+--------------------+-------------+----------------+
| Performance Metric | Testing Set | Validation Set |
+--------------------+-------------+----------------+
|      Accuracy      |     0.94    |      0.94      |
|     Precision      |     0.68    |      0.8       |
|       Recall       |     0.47    |      0.49      |
+--------------------+-------------+----------------+


In [15]:
# Assemble the full report first, then write it to disk in a single call.
metrics_report = (
    "Performance Metrics for random forest Classifier:\n"
    + str(rf_table)
    + "\n\nPerformance Metrics for gradient boost Classifier:\n"
    + str(gb_table)
)
with open("p4_aparanji_metrics.txt", "w") as file:
    file.write(metrics_report)

In [18]:
import boto3
# S3 client bound to the eu-south-2 region; the upload cells below use it.
s3 = boto3.client(service_name='s3', region_name='eu-south-2')

In [22]:
# Destination bucket and object key for the metrics report.
s3_bucket_p4_aparanji = "p4-aparanji"
s3_folder_path = "p4-Aparanji/p4_aparanji_metrics.txt"

# Upload the local metrics file written earlier in the notebook.
s3.upload_file(
    Filename='p4_aparanji_metrics.txt',
    Bucket=s3_bucket_p4_aparanji,
    Key=s3_folder_path,
)
print("Metrics file uploaded successfully to the S3 Bucket with the specified file path:", s3_folder_path)

Metrics file uploaded successfully to the S3 Bucket with the specified file path: p4-Aparanji/p4_aparanji_metrics.txt


In [20]:
# Object key for the random forest prediction file.
s3_classifier_path = "p4-Aparanji/Random Forest Classifier Model/predicted_quality_rf.csv"

# Upload the local prediction CSV to the project bucket.
s3.upload_file(
    Filename='predicted_quality_rf.csv',
    Bucket=s3_bucket_p4_aparanji,
    Key=s3_classifier_path,
)
print("Prediction results file uploaded successfully to the S3 Bucket with file path:", s3_classifier_path)

Prediction results file uploaded successfully to the S3 Bucket with file path: p4-Aparanji/Random Forest Classifier Model/predicted_quality_rf.csv


In [21]:
# Object key for the gradient boosting prediction file.
s3_classifier_path = "p4-Aparanji/Gradient Boost Classifier Model/predicted_quality_gb.csv"

# Upload the local prediction CSV to the project bucket.
s3.upload_file(
    Filename='predicted_quality_gb.csv',
    Bucket=s3_bucket_p4_aparanji,
    Key=s3_classifier_path,
)
print("Prediction results file uploaded successfully to the S3 Bucket with file path:", s3_classifier_path)

Prediction results file uploaded successfully to the S3 Bucket with file path: p4-Aparanji/Gradient Boost Classifier Model/predicted_quality_gb.csv
