In [2]:
#Data Loading
import pandas as pd

data = pd.read_csv('online_retail.csv', encoding='latin1')  # Try latin1 or another encoding
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      541910 non-null  object 
 1   StockCode    541910 non-null  object 
 2   Description  540456 non-null  object 
 3   Quantity     541910 non-null  int64  
 4   InvoiceDate  541910 non-null  object 
 5   Price        541910 non-null  float64
 6   Customer ID  406830 non-null  float64
 7   Country      541910 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
None


In [3]:
print(data.isnull().sum())

Invoice             0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
Price               0
Customer ID    135080
Country             0
dtype: int64


In [4]:
pip install scikit-learn



In [5]:
#For Quantity and Price, use the mean or median if numeric.
#For Country and InvoiceDate, use the most frequent value or infer based on related data.
data['Quantity'].fillna(data['Quantity'].median(), inplace=True)
data['Price'].fillna(data['Price'].mean(), inplace=True)
data['InvoiceDate'].fillna(method='ffill', inplace=True)  # Forward-fill the date
data['Country'].fillna(data['Country'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Quantity'].fillna(data['Quantity'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Price'].fillna(data['Price'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [6]:
#For Description Column: Replace missing values with "Unknown" or "No Description"
data['Description'].fillna("No Description", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Description'].fillna("No Description", inplace=True)


In [7]:
# For missing Customer ID around 123,840 of this column missing
#Dropping of rows with missing customer ID's
data = data.dropna(subset=['Customer ID'])

In [8]:
print(data.isnull().sum())

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


In [9]:
data = data[~data['Invoice'].str.startswith('C', na=False)]

In [10]:
data = data.drop_duplicates()

In [11]:
print(data.describe())

            Quantity          Price    Customer ID
count  392733.000000  392733.000000  392733.000000
mean       13.153687       3.125633   15287.728182
std       181.588189      22.240710    1713.570644
min         1.000000       0.000000   12346.000000
25%         2.000000       1.250000   13955.000000
50%         6.000000       1.950000   15150.000000
75%        12.000000       3.750000   16791.000000
max     80995.000000    8142.750000   18287.000000


In [12]:
#Isolation Forest: A machine learning algorithm for anomaly detection
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.01, random_state=42)
data['Anomaly'] = iso.fit_predict(data[['Quantity', 'Price']])
data = data[data['Anomaly'] == 1].drop(columns=['Anomaly'])

In [13]:
# Remove rows with negative values in 'Quantity' or 'UnitPrice'
data = data[(data['Quantity'] >= 1) & (data['Price'] >= 1)]

In [14]:
print(data[['Quantity', 'Price']].min())

Quantity    1.0
Price       1.0
dtype: float64


In [15]:
#Feature Engineering
#Revenue=Quantity*Price
data['Revenue'] = data['Quantity'] * data['Price']

In [16]:
#Encode categorical Columns like Country
data = pd.get_dummies(data, columns=['Country'], drop_first=True)


In [17]:
print(data.columns)


Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Revenue', 'Country_Austria', 'Country_Bahrain',
       'Country_Belgium', 'Country_Brazil', 'Country_Canada',
       'Country_Channel Islands', 'Country_Cyprus', 'Country_Czech Republic',
       'Country_Denmark', 'Country_EIRE', 'Country_European Community',
       'Country_Finland', 'Country_France', 'Country_Germany',
       'Country_Greece', 'Country_Iceland', 'Country_Israel', 'Country_Italy',
       'Country_Japan', 'Country_Lebanon', 'Country_Lithuania',
       'Country_Malta', 'Country_Netherlands', 'Country_Norway',
       'Country_Poland', 'Country_Portugal', 'Country_RSA',
       'Country_Saudi Arabia', 'Country_Singapore', 'Country_Spain',
       'Country_Sweden', 'Country_Switzerland', 'Country_USA',
       'Country_United Arab Emirates', 'Country_United Kingdom',
       'Country_Unspecified'],
      dtype='object')


In [18]:
data.columns = data.columns.str.strip()  # Remove any leading/trailing spaces from column names

In [19]:
import pandas as pd

# Try loading the dataset with a different encoding
X = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')  # Replace with the correct file path

# Step 2: Select numeric columns
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Step 3: Impute missing values for numeric columns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_numeric_imputed = imputer.fit_transform(X[numeric_columns])

# Step 4: List all columns
print("All columns in the original dataset:", X.columns)

# Step 5: Check the number of columns in the imputed data
print("Shape of imputed data:", X_numeric_imputed.shape)

# Step 6: Ensure the imputed DataFrame has the correct number of columns
X_imputed = pd.DataFrame(X_numeric_imputed, columns=numeric_columns[:X_numeric_imputed.shape[1]])

# Step 7: Add the non-numeric (datetime) columns back
datetime_columns = X.select_dtypes(include=['datetime']).columns
X_imputed = pd.concat([X_imputed, X[datetime_columns]], axis=1)

# Step 8: Check the final imputed data
print("Final imputed data:")
print(X_imputed.head())

All columns in the original dataset: Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')
Shape of imputed data: (541910, 3)
Final imputed data:
   Quantity  Price  Customer ID
0       6.0   2.55      17850.0
1       6.0   3.39      17850.0
2       8.0   2.75      17850.0
3       6.0   3.39      17850.0
4       6.0   3.39      17850.0


In [20]:
print(data['InvoiceDate'].isnull().sum())  # Count missing values in InvoiceDate

0


In [21]:
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')

In [22]:
invalid_dates = data[data['InvoiceDate'].isna()]
print(invalid_dates)

       Invoice StockCode                        Description  Quantity  \
26732   538521     21754           HOME BUILDING BLOCK WORD         3   
26733   538521     21755           LOVE BUILDING BLOCK WORD         3   
26734   538521     22072  RED RETROSPOT TEA CUP AND SAUCER          8   
26735   538521     22846         BREAD BIN DINER STYLE RED          1   
26736   538521     22849         BREAD BIN DINER STYLE MINT         1   
...        ...       ...                                ...       ...   
516364  579885    85034C        3 ROSE MORRIS BOXED CANDLES         4   
516365  579885     21742        LARGE ROUND WICKER PLATTER          2   
516366  579885     23084                 RABBIT NIGHT LIGHT         6   
516367  579885     21257        VICTORIAN SEWING BOX MEDIUM         1   
516368  579885     21259        VICTORIAN SEWING BOX SMALL          1   

       InvoiceDate  Price  Customer ID  Revenue  Country_Austria  \
26732          NaT   5.95      14180.0    17.85        

In [23]:
# Check unique values in InvoiceDate to identify any inconsistencies
print(data['InvoiceDate'].unique())

<DatetimeArray>
['2010-01-12 08:26:00', '2010-01-12 08:28:00', '2010-01-12 08:34:00',
 '2010-01-12 08:35:00', '2010-01-12 08:45:00', '2010-01-12 09:00:00',
 '2010-01-12 09:01:00', '2010-01-12 09:02:00', '2010-01-12 09:09:00',
 '2010-01-12 09:32:00',
 ...
 '2011-09-12 12:09:00', '2011-09-12 12:16:00', '2011-09-12 12:19:00',
 '2011-09-12 12:20:00', '2011-09-12 12:21:00', '2011-09-12 12:23:00',
 '2011-09-12 12:25:00', '2011-09-12 12:31:00', '2011-09-12 12:49:00',
 '2011-09-12 12:50:00']
Length: 7121, dtype: datetime64[ns]


In [24]:
# Convert to datetime using the specified format
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], format='%d/%m/%Y', errors='coerce')
print(data['InvoiceDate'])

0        2010-01-12 08:26:00
1        2010-01-12 08:26:00
2        2010-01-12 08:26:00
3        2010-01-12 08:26:00
4        2010-01-12 08:26:00
                 ...        
541905   2011-09-12 12:50:00
541906   2011-09-12 12:50:00
541907   2011-09-12 12:50:00
541908   2011-09-12 12:50:00
541909   2011-09-12 12:50:00
Name: InvoiceDate, Length: 298411, dtype: datetime64[ns]


In [25]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 298411 entries, 0 to 541909
Data columns (total 44 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   Invoice                       298411 non-null  object        
 1   StockCode                     298411 non-null  object        
 2   Description                   298411 non-null  object        
 3   Quantity                      298411 non-null  int64         
 4   InvoiceDate                   126462 non-null  datetime64[ns]
 5   Price                         298411 non-null  float64       
 6   Customer ID                   298411 non-null  float64       
 7   Revenue                       298411 non-null  float64       
 8   Country_Austria               298411 non-null  bool          
 9   Country_Bahrain               298411 non-null  bool          
 10  Country_Belgium               298411 non-null  bool          
 11  Country_Brazil    

In [26]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 298411 entries, 0 to 541909
Data columns (total 44 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   Invoice                       298411 non-null  object        
 1   StockCode                     298411 non-null  object        
 2   Description                   298411 non-null  object        
 3   Quantity                      298411 non-null  int64         
 4   InvoiceDate                   126462 non-null  datetime64[ns]
 5   Price                         298411 non-null  float64       
 6   Customer ID                   298411 non-null  float64       
 7   Revenue                       298411 non-null  float64       
 8   Country_Austria               298411 non-null  bool          
 9   Country_Bahrain               298411 non-null  bool          
 10  Country_Belgium               298411 non-null  bool          
 11  Country_Brazil    

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error, r2_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Assuming 'data' is already loaded and contains a 'Revenue' column

# Convert datetime columns to numeric features (extract year, month, day, etc.)
datetime_columns = data.select_dtypes(include=['datetime']).columns

for col in datetime_columns:
    data[col] = data[col].apply(lambda x: x.year * 10000 + x.month * 100 + x.day)  # Convert datetime to YYYYMMDD format

# Drop the 'Invoice' column and handle missing values (if any)
X = data.drop(columns=['Revenue', 'Invoice'])
y = data['Revenue']

# Convert revenue into binary classification (e.g., High or Low revenue)
threshold = y.median()  # Using median revenue to categorize
y_class = (y > threshold).astype(int)  # 1 for high, 0 for low

# Identify categorical and numeric columns
categorical_columns = X.select_dtypes(include=['object', 'bool']).columns
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Create separate imputer for numeric and categorical columns
numeric_imputer = SimpleImputer(strategy='mean')  # For numeric columns, use mean
categorical_imputer = SimpleImputer(strategy='most_frequent')  # For categorical columns, use the most frequent value

# Create a column transformer to apply different imputers for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_imputer, numeric_columns),  # Apply numeric imputer to numeric columns
        ('cat', Pipeline([
            ('imputer', categorical_imputer),  # Impute missing categorical values
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Apply OneHotEncoder to categorical columns
        ]), categorical_columns)
    ],
    remainder='passthrough'  # Keep the rest of the columns as is (e.g., datetime)
)

# Create a pipeline that first imputes missing values, preprocesses the data, and then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Handle missing data and apply transformations
    ('model', DecisionTreeRegressor(random_state=42))  # Fit the decision tree regressor
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Convert predictions to binary (since we are performing a binary classification task)
y_pred_binary = (y_pred > 0.5).astype(int)  # Assuming 0.5 as the threshold

# Calculate F1-Score
f1 = f1_score(y_test, y_pred_binary)
print(f"F1-score: {f1:.2f}")

# Calculate ROC AUC Score
roc_auc = roc_auc_score(y_test, y_pred_binary)
print(f"ROC AUC Score: {roc_auc:.2f}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Calculate R-squared (R²) score
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

# Calculate "accuracy" (percentage of variance explained)
accuracy = r2 * 100  # R² as a percentage
print(f"Accuracy: {accuracy:.2f}%")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)

# You can also check a few predictions to verify
print("Predictions:", y_pred[:5])  # Print first 5 predictions

F1-score: 1.00
ROC AUC Score: 1.00
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0
R-squared: 1.0
Accuracy: 100.00%
Confusion Matrix:
[[32524     0]
 [    0 27159]]
Predictions: [1. 1. 0. 0. 1.]


In [28]:
print(data.columns)

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Revenue', 'Country_Austria', 'Country_Bahrain',
       'Country_Belgium', 'Country_Brazil', 'Country_Canada',
       'Country_Channel Islands', 'Country_Cyprus', 'Country_Czech Republic',
       'Country_Denmark', 'Country_EIRE', 'Country_European Community',
       'Country_Finland', 'Country_France', 'Country_Germany',
       'Country_Greece', 'Country_Iceland', 'Country_Israel', 'Country_Italy',
       'Country_Japan', 'Country_Lebanon', 'Country_Lithuania',
       'Country_Malta', 'Country_Netherlands', 'Country_Norway',
       'Country_Poland', 'Country_Portugal', 'Country_RSA',
       'Country_Saudi Arabia', 'Country_Singapore', 'Country_Spain',
       'Country_Sweden', 'Country_Switzerland', 'Country_USA',
       'Country_United Arab Emirates', 'Country_United Kingdom',
       'Country_Unspecified'],
      dtype='object')


In [29]:
data.columns = data.columns.str.strip()  # Remove any leading/trailing spaces
data.columns = data.columns.str.lower()  # Convert all column names to lowercase

In [30]:
print(data.columns)

Index(['invoice', 'stockcode', 'description', 'quantity', 'invoicedate',
       'price', 'customer id', 'revenue', 'country_austria', 'country_bahrain',
       'country_belgium', 'country_brazil', 'country_canada',
       'country_channel islands', 'country_cyprus', 'country_czech republic',
       'country_denmark', 'country_eire', 'country_european community',
       'country_finland', 'country_france', 'country_germany',
       'country_greece', 'country_iceland', 'country_israel', 'country_italy',
       'country_japan', 'country_lebanon', 'country_lithuania',
       'country_malta', 'country_netherlands', 'country_norway',
       'country_poland', 'country_portugal', 'country_rsa',
       'country_saudi arabia', 'country_singapore', 'country_spain',
       'country_sweden', 'country_switzerland', 'country_usa',
       'country_united arab emirates', 'country_united kingdom',
       'country_unspecified'],
      dtype='object')


In [31]:
data['Revenue'] = data['quantity'] * data['price']

In [32]:
print(data.columns)

Index(['invoice', 'stockcode', 'description', 'quantity', 'invoicedate',
       'price', 'customer id', 'revenue', 'country_austria', 'country_bahrain',
       'country_belgium', 'country_brazil', 'country_canada',
       'country_channel islands', 'country_cyprus', 'country_czech republic',
       'country_denmark', 'country_eire', 'country_european community',
       'country_finland', 'country_france', 'country_germany',
       'country_greece', 'country_iceland', 'country_israel', 'country_italy',
       'country_japan', 'country_lebanon', 'country_lithuania',
       'country_malta', 'country_netherlands', 'country_norway',
       'country_poland', 'country_portugal', 'country_rsa',
       'country_saudi arabia', 'country_singapore', 'country_spain',
       'country_sweden', 'country_switzerland', 'country_usa',
       'country_united arab emirates', 'country_united kingdom',
       'country_unspecified', 'Revenue'],
      dtype='object')


In [33]:
# Define the features (X) and target (y)
X = data.drop(columns=['Revenue'])  # Exclude 'Revenue' column from features
y = data['Revenue']  # 'Revenue' is the target variable

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, precision_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')

# Create Revenue column
data['Revenue'] = data['Quantity'] * data['Price']

# Define the features (X) and target (y)
X = data.drop(columns=['Revenue'])
y = data['Revenue']

In [35]:
# Identify non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object', 'bool']).columns

In [36]:
# Create a preprocessor for handling missing values and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # OneHotEncoder for categorical columns
        ]), non_numeric_columns),

        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values
            ('passthrough', 'passthrough')  # Keep numeric columns as is
        ]), X.select_dtypes(include=['float64', 'int64']).columns)
    ]
)


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')

# Drop rows with critical missing values
data = data.dropna(subset=['Price', 'Quantity'])

# Fill missing Customer IDs with a default value (e.g., 0)
data['Customer ID'] = data['Customer ID'].fillna(0)

# Create Revenue column
data['Revenue'] = data['Quantity'] * data['Price']

# Sample the data for faster processing (optional, remove in production)
data = data.sample(frac=0.1, random_state=42)

# Define the features (X) and target (y)
X = data.drop(columns=['Revenue', 'Invoice', 'StockCode', 'Description', 'InvoiceDate'])
y = data['Revenue']

# Identify non-numeric columns for preprocessing
non_numeric_columns = X.select_dtypes(include=['object']).columns

# Create a preprocessor for handling missing values and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), non_numeric_columns),

        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean'))
        ]), X.select_dtypes(include=['float64', 'int64']).columns)
    ]
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that preprocesses the data and then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
explained_var = explained_variance_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Explained Variance Score: {explained_var}")


Mean Squared Error: 2422.5175776061747
Mean Absolute Error: 1.344803921025938
Explained Variance Score: 0.7209480274805941


In [38]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

In [39]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Calculate R-squared (R²) score
r2 = pipeline.score(X_test, y_test)
print(f"R-squared (R²): {r2:.2f}")

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Calculate Precision (in this context, it might not be directly applicable for regression tasks)
# Precision is typically used for classification tasks, but let's include it for educational purposes
precision = precision_score(y_test > 0, y_pred > 0)
print(f"Precision: {precision:.2f}")

Mean Squared Error (MSE): 2422.52
R-squared (R²): 0.72
Mean Absolute Error (MAE): 1.34
Precision: 1.00


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load data
data = pd.read_csv('online_retail.csv', encoding='latin1')

# Drop rows with critical missing values
data = data.dropna(subset=['Quantity', 'Price'])

# Calculate Revenue
data['Revenue'] = data['Quantity'] * data['Price']

# Define a binary target for classification (e.g., high revenue vs. low revenue)
threshold = data['Revenue'].median()  # Median threshold
data['HighRevenue'] = (data['Revenue'] > threshold).astype(int)

# Drop irrelevant columns
data = data.drop(columns=['Invoice', 'Description', 'InvoiceDate', 'Customer ID', 'Country'])

# Handle non-numeric data (if any remain)
# Check for non-numeric columns
non_numeric_cols = data.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_cols}")

# Encode non-numeric columns (if any)
if not non_numeric_cols.empty:
    for col in non_numeric_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Define features and target
X = data.drop(columns=['Revenue', 'HighRevenue'])
y = data['HighRevenue']

# Sample the data to reduce size
X, y = X.sample(frac=0.1, random_state=42), y.sample(frac=0.1, random_state=42)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model with a limit on iterations
model = SVC(kernel='linear', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")


Non-numeric columns: Index(['StockCode'], dtype='object')
Accuracy: 0.49
Precision: 0.49
Recall: 1.00
ROC AUC Score: 0.50


