In [35]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [36]:
df = pd.read_csv('Plastic based Textiles in clothing industry.csv')

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6956 entries, 0 to 6955
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Company                   6956 non-null   object
 1   Product_Type              6956 non-null   object
 2   Production_Year           6956 non-null   int64 
 3   Greenhouse_Gas_Emissions  6956 non-null   int64 
 4   Pollutants_Emitted        6956 non-null   int64 
 5   Water_Consumption         6956 non-null   int64 
 6   Energy_Consumption        6956 non-null   int64 
 7   Waste_Generation          6956 non-null   int64 
 8   Sales_Revenue             6956 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 489.2+ KB


In [38]:
list(df)

['Company',
 'Product_Type',
 'Production_Year',
 'Greenhouse_Gas_Emissions',
 'Pollutants_Emitted',
 'Water_Consumption',
 'Energy_Consumption',
 'Waste_Generation',
 'Sales_Revenue']

In [39]:
# Calculating median values
median_values = {
    "Greenhouse_Gas_Emissions": df["Greenhouse_Gas_Emissions"].median(),
    "Pollutants_Emitted": df["Pollutants_Emitted"].median(),
    "Water_Consumption": df["Water_Consumption"].median(),
    "Energy_Consumption": df["Energy_Consumption"].median(),
    "Waste_Generation": df["Waste_Generation"].median()
}

# Function to determine sustainability
def is_sustainable(row):
    conditions_met = sum([
        row["Greenhouse_Gas_Emissions"] <= median_values["Greenhouse_Gas_Emissions"],
        row["Pollutants_Emitted"] <= median_values["Pollutants_Emitted"],
        row["Water_Consumption"] <= median_values["Water_Consumption"],
        row["Energy_Consumption"] <= median_values["Energy_Consumption"],
        row["Waste_Generation"] <= median_values["Waste_Generation"]]
    )
    return "Yes" if conditions_met >= 2 else "No"

# Applying the function to the dataset
df["Sustainable"] = df.apply(is_sustainable, axis=1)


# Encoding categorical columns 'Product_Type' and 'Company' using Label Encoding
label_encoder = LabelEncoder()
df["Product_Type"] = label_encoder.fit_transform(df["Product_Type"])
df["Company"] = label_encoder.fit_transform(df["Company"])

# Selecting numerical columns for normalization
numerical_cols = ["Production_Year", "Greenhouse_Gas_Emissions", "Pollutants_Emitted", "Water_Consumption", "Energy_Consumption", "Waste_Generation", "Sales_Revenue"]

# Applying Min-Max Scaling
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the processed dataset
print(df)


      Company  Product_Type  Production_Year  Greenhouse_Gas_Emissions  \
0           4             5             0.50                  0.761905   
1           4             3             0.25                  0.285714   
2           4             6             0.75                  0.404762   
3           4             0             0.00                  0.047619   
4           4             7             1.00                  1.000000   
...       ...           ...              ...                       ...   
6951        0             8             0.00                  0.337619   
6952        3             2             0.25                  0.195714   
6953        3             4             0.25                  0.980952   
6954        0             2             0.50                  0.261190   
6955        3             3             0.50                  0.387619   

      Pollutants_Emitted  Water_Consumption  Energy_Consumption  \
0                 0.6875            0.87500 

In [40]:
list(df)

['Company',
 'Product_Type',
 'Production_Year',
 'Greenhouse_Gas_Emissions',
 'Pollutants_Emitted',
 'Water_Consumption',
 'Energy_Consumption',
 'Waste_Generation',
 'Sales_Revenue',
 'Sustainable']

In [41]:
columns_titles = [
 'Company',
 'Product_Type',
 'Production_Year',
 'Greenhouse_Gas_Emissions',
 'Pollutants_Emitted',
 'Water_Consumption',
 'Energy_Consumption',
 'Waste_Generation',
 'Sales_Revenue']
x = df.reindex(columns=columns_titles)
print(x)

      Company  Product_Type  Production_Year  Greenhouse_Gas_Emissions  \
0           4             5             0.50                  0.761905   
1           4             3             0.25                  0.285714   
2           4             6             0.75                  0.404762   
3           4             0             0.00                  0.047619   
4           4             7             1.00                  1.000000   
...       ...           ...              ...                       ...   
6951        0             8             0.00                  0.337619   
6952        3             2             0.25                  0.195714   
6953        3             4             0.25                  0.980952   
6954        0             2             0.50                  0.261190   
6955        3             3             0.50                  0.387619   

      Pollutants_Emitted  Water_Consumption  Energy_Consumption  \
0                 0.6875            0.87500 

In [42]:
df['Sustainable'] = df['Sustainable'].replace({'Yes': 1, 'No': 0})
y = df['Sustainable']
y

0       0
1       1
2       1
3       1
4       0
       ..
6951    1
6952    1
6953    1
6954    1
6955    1
Name: Sustainable, Length: 6956, dtype: int64

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [44]:
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(x_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [45]:
y_pred = model.predict_proba(x_test)
y_pred

array([[0.00523807, 0.99476193],
       [0.07212625, 0.92787375],
       [0.01330218, 0.98669782],
       ...,
       [0.01671546, 0.98328454],
       [0.82009362, 0.17990638],
       [0.01813601, 0.98186399]])

In [46]:
y_pred1 = model.predict(x_test)
y_pred1

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [47]:
accuracy = accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)

Accuracy: 0.8994252873563219
