### Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats.mstats

### Import dataset

In [2]:
df_train = pd.read_csv(r'C:\Users\11 PrO\Downloads\train.csv\train.csv')
df_test  = pd.read_csv(r'C:\Users\11 PrO\Downloads\test.csv\test.csv')

### Removing redundant columns

In [3]:

def remove_redundant_columns(dataset):
    # Create a set to store the names of columns already encountered
    seen = set()
    # List to store columns to drop
    to_drop = []

    # Iterate through columns
    for column in dataset.columns:
        # If we have already seen the column name, mark it for dropping
        if column in seen:
            to_drop.append(column)
        else:
            seen.add(column)

    # Drop redundant columns
    dataset.drop(to_drop, axis=1, inplace=True)
    return dataset

# Example usage:
# Load your dataset
# df = pd.read_csv('your_dataset.csv')

# Remove redundant columns
data1 = remove_redundant_columns(df_train)
data2 = remove_redundant_columns(df_test)


### checking for duplicate rows

In [4]:
import pandas as pd

# Assuming data1 is your DataFrame
duplicate_rows = data1[data1.duplicated()]

# Display duplicate rows
print("Duplicate Rows:")
print(duplicate_rows)


Duplicate Rows:
Empty DataFrame
Columns: [id, CustomerGender, CustomerType, Age, TravelPurpose, ClassTravelled, DistanceToDestination, Inflight wifi, Time convenience, WebsiteExperience, ConvenienceOfGate, Inflight Food, Online check-in, ComfortOfSeats, Inflight entertainment system, On-board service, Leg room in flight, Baggage handling ease, Checkin service, Inflight service, Cleanliness, Departure Delay in Minutes, Arrival Delay in Minutes, CustomerHappiness]
Index: []

[0 rows x 24 columns]


In [None]:
data1

### outliers

In [5]:
import numpy as np

def remove_outliers_iqr(data1, threshold=1.5):
    """
    Remove outliers from a data1set using the Interquartile Range (IQR) method.
    
    Parameters:
        data1 (numpy array or pandas data1Frame): The data1set.
        threshold (float): The threshold value for identifying outliers. Default is 1.5.
    
    Returns:
        numpy array or pandas data1Frame: The data1set with outliers removed.
    """
    # Calculate the first quartile (Q1) and third quartile (Q3)
    Q1 = np.percentile(data1, 25)
    Q3 = np.percentile(data1, 75)
    
    # Calculate the interquartile range (IQR)
    IQR = Q3 - Q1
    
    # Define the outlier cutoff (any data1 point outside this range is considered an outlier)
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    
    # Find outliers
    outliers = (data1 < lower_bound) | (data1 > upper_bound)
    
    # Remove outliers
    cleaned_data1 = data1[~outliers]
    
    return cleaned_data1

# Example usage:
# Assuming 'data1' is your data1set (a numpy array or pandas data1Frame)
cleaned_data1 = remove_outliers_iqr(data1)


TypeError: '<' not supported between instances of 'str' and 'int'

### Check for missing values and fill them

In [6]:
def columns_with_missing_values(df):
    # Check for missing values in each column
    missing_values = df.isnull().sum()

    # Filter out columns with missing values
    missing_columns = missing_values[missing_values > 0]

    if missing_columns.empty:
        return []
    else:
        return missing_columns.index.tolist()


columns_with_missing = columns_with_missing_values(data1)
print("Columns with missing values in train set:", columns_with_missing)
columns_with_missing = columns_with_missing_values(data2)
print("Columns with missing values in test set:", columns_with_missing)



Columns with missing values in train set: ['Arrival Delay in Minutes']
Columns with missing values in test set: ['Arrival Delay in Minutes']


In [7]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Assuming data1 is your DataFrame
column_name = 'Arrival Delay in Minutes'

# Create a SimpleImputer and fit it to your data
imputer = SimpleImputer(strategy='mean')  # You can choose a different strategy if needed
imputer.fit(data1[[column_name]])

# Use the imputer to fill missing values in the specified column
data1[column_name] = imputer.transform(data1[[column_name]])

# Remove duplicate rows based on the specified column
data1_no_duplicates = data1.drop_duplicates(subset=[column_name])

# Display the resulting DataFrame
print("DataFrame without null values and duplicate rows in the specified column:")
print(data1_no_duplicates)


DataFrame without null values and duplicate rows in the specified column:
            id CustomerGender CustomerType  Age TravelPurpose  ClassTravelled  \
0        70172           Male     Frequent   13    Recreation  PremiumEconomy   
1         5047           Male   Occasional   25      Business        Business   
2       110028         Female     Frequent   26      Business        Business   
3        24026         Female     Frequent   25      Business        Business   
6        82113           Male     Frequent   47    Recreation         Economy   
...        ...            ...          ...  ...           ...             ...   
99898    79335         Female     Frequent   40      Business        Business   
100909    5921           Male     Frequent   41    Recreation         Economy   
101096  113935         Female     Frequent   52      Business        Business   
101920    5702         Female     Frequent    7    Recreation         Economy   
102609   66787         Female     F

In [8]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Assuming data1 is your DataFrame
column_name = 'Arrival Delay in Minutes'

# Create a SimpleImputer and fit it to your data
imputer = SimpleImputer(strategy='mean')  # You can choose a different strategy if needed
imputer.fit(data2[[column_name]])

# Use the imputer to fill missing values in the specified column
data2[column_name] = imputer.transform(data2[[column_name]])

# Remove duplicate rows based on the specified column
data2_no_duplicates = data2.drop_duplicates(subset=[column_name])

# Display the resulting DataFrame
print("DataFrame without null values and duplicate rows in the specified column:")
print(data2_no_duplicates)



DataFrame without null values and duplicate rows in the specified column:
           id CustomerGender CustomerType  Age TravelPurpose ClassTravelled  \
0       19556         Female     Frequent   52      Business        Economy   
1       90035         Female     Frequent   36      Business       Business   
3       77959           Male     Frequent   44      Business       Business   
4       36875         Female     Frequent   49      Business        Economy   
7       97286         Female     Frequent   43      Business       Business   
...       ...            ...          ...  ...           ...            ...   
25218   60214         Female   Occasional   43      Business       Business   
25289  107748           Male     Frequent   66    Recreation        Economy   
25356   13844         Female     Frequent   10    Recreation        Economy   
25574   16776           Male     Frequent   63    Recreation        Economy   
25771   15343           Male   Occasional   44      Busin

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=4)

data1['Arrival Delay in Minutes'] = imputer.fit_transform(data1[['Arrival Delay in Minutes']])
data2['Arrival Delay in Minutes'] = imputer.fit_transform(data2[['Arrival Delay in Minutes']])

In [6]:
from sklearn.impute import KNNImputer
impute=KNNImputer()

for i in  df_train.select_dtypes(include="number").columns:
    data1[i]=impute.fit_transform(data1[[i]])


In [7]:
for i in  df_test.select_dtypes(include="number").columns:
    data2[i]=impute.fit_transform(data2[[i]])

In [8]:
for i in ['Arrival Delay in Minutes']:
    data1[i].fillna(data1[i].mode()[0],inplace=True)

In [9]:
for i in ['Arrival Delay in Minutes']:
    data2[i].fillna(data2[i].mode()[0],inplace=True)

### Encoding categorical features

In [9]:
# fetch categorical features
def get_categorical_features(df):
    # Select columns with dtype 'object' (strings) or 'category'
    categorical_features = data1.select_dtypes(include=['object', 'category']).columns.tolist()
    return categorical_features

categorical_features = get_categorical_features(data1)
print("Categorical features:", categorical_features)


Categorical features: ['CustomerGender', 'CustomerType', 'TravelPurpose', 'ClassTravelled', 'CustomerHappiness']


In [10]:
#ENCODING THE FEATURES
from sklearn.preprocessing import LabelEncoder

def label_encode_categorical_features(df, categorical_features):
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Iterate through each categorical feature and encode its values
    for feature in categorical_features:
        data1[feature] = label_encoder.fit_transform(data1[feature])

    return df

# Get categorical features
categorical_features = get_categorical_features(data1)

# Label encode categorical features
data1 = label_encode_categorical_features(data1, categorical_features)
# Now, categorical features in df have been label encoded



In [None]:
data1

In [11]:
#test
# fetch categorical features
def get_categorical_features(df):
    # Select columns with dtype 'object' (strings) or 'category'
    categorical_features = data2.select_dtypes(include=['object', 'category']).columns.tolist()
    return categorical_features

categorical_features = get_categorical_features(data2)
print("Categorical features:", categorical_features)


Categorical features: ['CustomerGender', 'CustomerType', 'TravelPurpose', 'ClassTravelled']


In [12]:
#ENCODING THE FEATURES
from sklearn.preprocessing import LabelEncoder

def label_encode_categorical_features(df, categorical_features):
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Iterate through each categorical feature and encode its values
    for feature in categorical_features:
        df[feature] = label_encoder.fit_transform(df[feature])

    return df

# Get categorical features
categorical_features = get_categorical_features(data2)

# Label encode categorical features
df2 = label_encode_categorical_features(data2, categorical_features)
# Now, categorical features in df have been label encoded


In [None]:
df2 #test 

In [None]:
import matplotlib.pyplot as plt

# Assuming 'data' is your dataset (a pandas DataFrame or numpy array)

# Create a box plot to visualize the distribution of each feature
plt.figure(figsize=(10, 6))
plt.boxplot(df1)
plt.xticks(range(1, len(df1.columns) + 1), df1.columns)
plt.xlabel('Features')
plt.ylabel('Values')
plt.title('Box Plot of Features')
plt.show()


### fill missing values

In [13]:
def columns_with_missing_values(data2):
    # Check for missing values in each column
    missing_values = data1.isnull().sum()

    # Filter out columns with missing values
    missing_columns = missing_values[missing_values > 0]

    if missing_columns.empty:
        return []
    else:
        return missing_columns.index.tolist()


columns_with_missing = columns_with_missing_values(df2)
print("Columns with missing values:", columns_with_missing)



Columns with missing values: []


### allocating X and y

In [14]:
# Step 1: Split the dataset into features (X) and the target variable (y)
X = data1.drop(columns=['id','CustomerHappiness'])  # Features
y = data1['CustomerHappiness']  # Target variable
#X, data2 = X.align(data2, join='outer', axis=1, fill_value=0)
# Step 2: Split the data into training and testing sets
X1 = df2.drop(columns=['id'])


### standard scaler

In [15]:
import numpy as np
from sklearn.preprocessing import StandardScaler


# Reshape to a two-dimensional array
# Initialize StandardScaler
import numpy as np
from sklearn.preprocessing import StandardScaler


# Reshape to a two-dimensional array
# Initialize StandardScaler
scaler = StandardScaler()
# Normalize the features (X)

X_scaled = scaler.fit_transform(X)  # Reshape y to a 2D array
X_scaled1 =scaler.fit_transform(X1)
# Display the normalized features (X)
print("Normalized Features:")
print(X_scaled)





# Replace existing X and y features in df1 with scaled features
data1[X.columns] = X_scaled
df2[X1.columns] = X_scaled1


Normalized Features:
[[ 1.01503056 -0.4727667  -1.7452793  ...  1.30586973  0.26639265
   0.07301421]
 [ 1.01503056  2.11520819 -0.95136024 ... -1.74229153 -0.36137482
  -0.23753899]
 [-0.98519201 -0.4727667  -0.88520032 ...  1.30586973 -0.3875318
  -0.39281559]
 ...
 [ 1.01503056  2.11520819 -0.62056063 ...  0.54382941 -0.20443295
  -0.03050353]
 [-0.98519201  2.11520819 -1.14984    ... -1.74229153 -0.3875318
  -0.39281559]
 [ 1.01503056 -0.4727667  -0.8190404  ... -1.74229153 -0.3875318
  -0.39281559]]


In [None]:
df2

### Correlation matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming you have a DataFrame named 'data'
# If not, replace it with your DataFrame containing numerical columns

# Generate a correlation matrix
correlation_matrix = data1.corr()

# Set up the matplotlib figure
plt.figure(figsize=(30,40))

# Plot the heatmap using Seaborn
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# Show the plot
plt.title('Correlation Matrix')
plt.show()


### run this pls

In [18]:

#X = data1.drop(columns=['CustomerHappiness'])  # Features
y = data1['CustomerHappiness']  # Target variable
X, df2 = X.align(df2, join='outer', axis=1)
# Step 2: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


### Smapling

In [None]:
import pandas as pd

# Assuming act_df is your DataFrame with 'attack_cat' column representing class attack_cats

# Check the unique values in the 'attack_cat' column
unique_classes = data1['CustomerHappiness'].unique()

# Print the unique classes
print("Unique Classes:", unique_classes)

# Check the number of records for each class
class_counts = data1['CustomerHappiness'].value_counts()
print("Class Counts:")
print(class_counts)

In [None]:
import pandas as pd

# Assuming act_df is your DataFrame with 'attack_cat' column representing class attack_cats

# Function to sample 50,000 rows from each group without replacement if the group size is sufficient
def sample_rows(group):
    sample_size = min(50000, len(group))
    return group.sample(n=sample_size, replace=sample_size < 50000, random_state=0)

# Apply the sampling function to each group (label)
sampled_df = data1.groupby('CustomerHappiness', group_keys=False).apply(sample_rows)

# Reset the index of the sampled DataFrame
sampled_df.reset_index(drop=True, inplace=True)

# Display the sampled DataFrame
print(sampled_df)


In [None]:
duplicates = sampled_df[sampled_df.duplicated()]
print("Duplicate rows:")
print(duplicates)


In [None]:
import pandas as pd

# Assuming sampled_df is your DataFrame
# Remove entirely identical rows
sampled_df = sampled_df.drop_duplicates()

# Reset the index of the DataFrame
sampled_df.reset_index(drop=True, inplace=True)

# Display the DataFrame without duplicates
print(sampled_df)


In [None]:
# Step 1: Split the dataset into features (X) and the target variable (y)
X = sampled_df.drop(columns=['CustomerHappiness'])  # Features
y = sampled_df['CustomerHappiness']  # Target variable
X, data2 = X.align(data2, join='outer', axis=1, fill_value=0)
# Step 2: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### model

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


# Define the base classifiers
rf_clf = RandomForestClassifier(random_state=42)
xgb_clf = XGBClassifier(random_state=42)

# Define the VotingClassifier
super_clf1 = VotingClassifier(estimators=[('rf', rf_clf), ('xgb', xgb_clf)], voting='hard')

# Train the super classifier
super_clf1.fit(X_train, y_train)

# Make predictions
y_pred = super_clf1.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 Score:", f1)


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Define the individual classifiers
clf1 = DecisionTreeClassifier(random_state=42)
clf2 = KNeighborsClassifier()
clf3 = SVC(probability=True)

# Define the ensemble classifier using hard voting
voting_clf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='hard')

# Train the ensemble classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Import necessary libraries

from sklearn.metrics import f1_score, classification_report
import xgboost  as xgb
import numpy as np

# Generate some random data for demonstration purposes

# Create an XGBoost classifier

# Create an XGBoost model with adjusted parameters
model1 = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=5,
    min_child_weight=1,  # Adjusted this parameter
    gamma=0.01,  # Removed gamma for simplicity, you can experiment with it
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.1,
    n_estimators=600,
    subsample=0.8,
    reg_alpha=0.75,  # Adjusted regularization parameters
    reg_lambda=0.25
)


# Train the model on the training data
model1.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model1.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Print F1 score and classification report
print(f"F1 Score: {f1}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


# Define the XGBoost classifier
xgb = XGBClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Define the evaluation metric (F1 score in this case)
scoring_metric = 'f1'

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=scoring_metric, cv=3)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the final model with the best hyperparameters on the entire training set
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = final_model.predict(X_test)

# Evaluate the final model using F1 score
f1 = f1_score(y_test, y_pred)
print("Validation F1 Score:", f1)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report



# Split the data into training and testing sets

# Initialize Extra Trees Classifier
extra_trees_model = ExtraTreesClassifier(n_estimators=300, random_state=42, criterion='gini',max_depth=None,min_samples_split=5,min_samples_leaf=1,bootstrap=True,n_jobs=None)

# Train the model
extra_trees_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = extra_trees_model.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Display F1 score and classification report
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Sample data (replace this with your dataset)

# Initialize Random Forest Classifier
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Display F1 score and classification report
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report


# Initialize Decision Tree Classifier
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train the model
decision_tree_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree_model.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Display F1 score and classification report
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### hardvoting=xg+lgb+extra trees

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.metrics import f1_score
import lightgbm as lgb

# Assuming X_train, y_train, X_test, y_test are your training and testing data

# Initialize individual classifiers with best parameters
etc_model = ExtraTreesClassifier(n_estimators=300, max_depth=None, min_samples_split=5, min_samples_leaf=1)
xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1)
lgb_model = lgb.LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, num_leaves=31)

# Create a voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('etc', etc_model),
    ('xgb', xgb_model),
    ('lgb', lgb_model)
], voting='hard')

# Train the voting classifier
voting_classifier.fit(X_train, y_train)

# Make predictions
ensemble_pred = voting_classifier.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, ensemble_pred)

print("F1 Score of Super Classifier (Hard Voting):", f1)


### hyperparameter tuning of light gbm

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the classifier
lgb_model = lgb.LGBMClassifier()

# Define the hyperparameters grid to search
param_grid = {
    'num_leaves': [20, 30, 40],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'min_child_samples': [5, 10, 20],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5]
}

# Define the scoring metric
scorer = make_scorer(f1_score)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring=scorer, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_score)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, classification_report


# Initialize base models
base_models = [
    ('random_forest', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('xgboost', XGBClassifier(learning_rate=1.0, n_estimators=200, random_state=42))
]

# Initialize StackingClassifier with a meta-classifier (Random Forest in this case)
stacking_model = StackingClassifier(estimators=base_models, final_estimator=RandomForestClassifier(random_state=42))

# Train the model on the training set
stacking_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_stacked = stacking_model.predict(X_test)

# Calculate F1 score for the stacked model
f1_stacked = f1_score(y_test, y_pred_stacked, average='weighted')

# Display F1 score and classification report for the stacked model
print("Stacked Model:")
print(f"F1 Score: {f1_stacked:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_stacked))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, classification_report


# Initialize base models
base_models = [
    ('adaboost', AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
]

# Initialize StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=RandomForestClassifier(random_state=42))

# Train the model on the training set
stacking_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = stacking_model.predict(X_test)

# Calculate F1 score on the test set
f1 = f1_score(y_test, y_pred, average='weighted')

# Display F1 score and classification report
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### ensemble with three classifiers+k fold+regulariztaion

In [17]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np

# Assuming X, y are your features and target variable
# Define the number of folds for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define base models
base_models = [
    ('xgb', xgb.XGBClassifier(n_estimators=100, max_depth=5, reg_alpha=0.1, reg_lambda=0.1)),
    ('lgbm', lgb.LGBMClassifier(n_estimators=100, max_depth=5, reg_alpha=0.1, reg_lambda=0.1)),
    ('etc', ExtraTreesClassifier(n_estimators=100, max_depth=5))
]

# Define meta classifier
meta_model = xgb.XGBClassifier(n_estimators=100, max_depth=5)

# Initialize empty lists to store F1 scores for each fold
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialize stacking classifier
    stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)
    
    # Train stacking classifier
    stacking_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = stacking_model.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, y_pred)
    f1_scores.append(f1)

# Calculate average F1 score across all folds
average_f1_score = np.mean(f1_scores)

print("Average F1 Score of Stacking Classifier with {}-fold cross-validation: {:.4f}".format(n_splits, average_f1_score))


KeyError: "None of [Index([     0,      1,      2,      3,      4,      5,      6,      7,      9,\n           10,\n       ...\n       103890, 103891, 103892, 103893, 103895, 103897, 103898, 103900, 103901,\n       103902],\n      dtype='int32', length=83123)] are in the [columns]"

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Define the LightGBM model for binary classification
lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42)

# Train the model
lgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lgb_model.predict(X_test)

# Calculate F1 Score on the Test Set
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report



# Initialize CatBoost Classifier
catboost_model = CatBoostClassifier(iterations=160, depth=10, learning_rate=1.0, loss_function='MultiClass', random_state=42)

# Train the model on the training set
catboost_model.fit(X_train, y_train, verbose=100)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Calculate F1 score on the test set
f1 = f1_score(y_test, y_pred, average='weighted')

# Display F1 score and classification report
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
num_columns = submission.shape[1]

# Display the number of columns
print("Number of Columns:", num_columns)

In [None]:
features_list = data2.columns.tolist()

# Display the list of features
print("List of Features:", features_list)

### submission

In [None]:
test_predictions = lgb_model.predict(df2)
print(test_predictions)

In [None]:
import pandas as pd
import pandas as pd

# Assuming df is your DataFrame
# Drop duplicate rows
df_no_duplicates = submission.drop_duplicates()

# Display the resulting DataFrame
print("Original DataFrame:")
print(submission)

print("\nDataFrame without duplicate rows:")
print(df_no_duplicates)



In [None]:
import pandas as pd

# Assuming df is your DataFrame
num_rows = df_no_duplicates.shape[0]

print(f"Number of rows in the DataFrame: {num_rows}")


In [None]:
data2['CustomerHappiness'] = test_predictions
data2['CustomerHappiness'] = data2['CustomerHappiness'].replace({0: 'Unhappy', 1: 'Happy'})
#data2['CustomerHappiness'] = data2['CustomerHappiness'].replace({'Happy': 'Unhappy', 'Unhappy': 'Happy'})
submission = data2.drop(columns=[ 'CustomerGender', 'CustomerType', 'Age', 'TravelPurpose', 'ClassTravelled', 'DistanceToDestination', 'Inflight wifi', 'Time convenience', 'WebsiteExperience', 'ConvenienceOfGate', 'Inflight Food', 'Online check-in', 'ComfortOfSeats', 'Inflight entertainment system', 'On-board service', 'Leg room in flight', 'Baggage handling ease', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',])
submission

In [None]:
import pandas as pd

# Assuming 'submission' is your DataFrame and 'column_name' is the column you want to check
column_name = 'id'


# Check if there are duplicate values in the specified column
duplicates = submission.duplicated(subset=[column_name])

# Print rows with duplicate values in the specified column
print(submission[duplicates])


In [None]:
submission.info()

In [None]:
import pandas as pd

# Assuming data1 is your DataFrame
duplicate_rows = submission[submission.duplicated()]

# Display duplicate rows
print("Duplicate Rows:")
print(duplicate_rows)

In [None]:
submission.to_csv("submit7.csv")

In [None]:
import pandas as pd

# Assuming data1 is your DataFrame and 'column_name' is the column with duplicate values
column_name = 'id'

# Remove rows with duplicate values in the specified column
data1_no_duplicates = submission.drop_duplicates(subset=[column_name])

# Display the resulting DataFrame
print("DataFrame without duplicate rows in the specified column:")
print(data1_no_duplicates)


In [None]:
output = pd.DataFrame({'id': submission.id.astype('Int32'),
                       'CustomerHappiness': submission["CustomerHappiness"]})

# Step 3: Save the DataFrame to a CSV file
output.to_csv('submission10.csv.csv', index=False)

### Learning curves

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Assume you have your features (X) and labels (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set up parameters for LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'num_boost_round': 1000,  # You can adjust the number of boosting rounds
}

# Train the LightGBM model with early stopping
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    early_stopping_rounds=10,
    verbose_eval=True
)

# Predict on the test set
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred]

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Test Accuracy: {accuracy}")

# Plot the training and validation performance
plt.plot(range(1, model.best_iteration + 1), model.eval_train()['binary_error'], label='Training Error')
plt.plot(range(1, model.best_iteration + 1), model.eval_valid()['binary_error'], label='Validation Error')
plt.xlabel('Boosting Round')
plt.ylabel('Error')
plt.title('LightGBM Model Training and Validation Error')
plt.legend()
plt.show()


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Assume you have your features (X) and labels (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an Extra Trees model with adjusted parameters
model = ExtraTreesClassifier(
    n_estimators=150,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # Use 'sqrt' instead of 'auto'
    bootstrap=False,
    random_state=42,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None
)

# Lists to store training and validation errors
train_errors = []
valid_errors = []

# Training loop
for epoch in range(1, 100):  # Adjust the number of epochs as needed
    model.fit(X_train, y_train)
    
    # Training error
    train_pred = model.predict(X_train)
    train_error = 1 - np.mean(train_pred == y_train)
    train_errors.append(train_error)
    
    # Validation error
    valid_pred = model.predict(X_test)
    valid_error = 1 - np.mean(valid_pred == y_test)
    valid_errors.append(valid_error)

# Plotting the training and validation errors
epochs = np.arange(1, 100)  # Adjust the number of epochs as needed
plt.plot(train_errors, label='Training Error')
plt.plot(valid_errors, label='Validation Error')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Training and Validation Errors over Epochs')
plt.legend()
plt.show()


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Assume you have your features (X) and labels (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model with adjusted parameters
model = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=5,
    min_child_weight=1,  # Adjusted this parameter
    gamma=0.01,  # Removed gamma for simplicity, you can experiment with it
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.1,
    n_estimators=150,
    subsample=0.8,
    reg_alpha=0.25,  # Adjusted regularization parameters
    reg_lambda=0.25
)

# Specify the evaluation dataset
eval_set = [(X_train, y_train), (X_test, y_test)]

# Set up early stopping
early_stopping_rounds = 10
eval_metric = ["error", "logloss"]

# Train the model with early stopping
model.fit(
    X_train, y_train,
    eval_metric=eval_metric,
    eval_set=eval_set,
    early_stopping_rounds=early_stopping_rounds,
    verbose=True
)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# Plot the training and validation performance
results = model.evals_result()
epochs = len(results['validation_0']['error'])

plt.plot(range(1, epochs + 1), results['validation_0']['error'], label='Training Error')
plt.plot(range(1, epochs + 1), results['validation_1']['error'], label='Validation Error')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('XGBoost Model Training and Validation Error')
plt.legend()
plt.show()
