# Question 3

In [41]:
import pandas as pd
import numpy as np


In [42]:
df = pd.read_csv('HR-Employee-Attrition.csv', ) 

In [43]:

# Summary statistics
print(df.describe())



               Age    DailyRate  DistanceFromHome    Education  \
count  1470.000000  1470.000000       1470.000000  1470.000000   
mean     36.923810   802.485714          9.192517     2.912925   
std       9.135373   403.509100          8.106864     1.024165   
min      18.000000   102.000000          1.000000     1.000000   
25%      30.000000   465.000000          2.000000     2.000000   
50%      36.000000   802.000000          7.000000     3.000000   
75%      43.000000  1157.000000         14.000000     4.000000   
max      60.000000  1499.000000         29.000000     5.000000   

       EnvironmentSatisfaction     JobLevel   MonthlyRate  NumCompaniesWorked  \
count              1470.000000  1470.000000   1436.000000         1470.000000   
mean                  2.721769     2.063946  14301.253482            2.693197   
std                   1.093082     1.106940   7141.338347            2.498009   
min                   1.000000     1.000000   2094.000000            0.000000   


In [44]:
# Check for missing values
print(df.isnull().sum())

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                     31
JobLevel                    0
MaritalStatus               0
MonthlyRate                34
NumCompaniesWorked          0
OverTime                    0
PerformanceRating           0
WorkLifeBalance             0
YearsAtCompany              0
dtype: int64


In [45]:
# delete missing values
df.dropna(inplace=True)

In [46]:
# Check for missing values
print(df.isnull().sum())

Age                        0
Attrition                  0
BusinessTravel             0
DailyRate                  0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EnvironmentSatisfaction    0
Gender                     0
JobLevel                   0
MaritalStatus              0
MonthlyRate                0
NumCompaniesWorked         0
OverTime                   0
PerformanceRating          0
WorkLifeBalance            0
YearsAtCompany             0
dtype: int64


In [47]:
# Define age bins and labels
age_bins = [0, 20, 30, 40, 50, float('inf')]
age_labels = ['0-20', '20-30', '30-40', '40-50', '50+']

# Cut the 'Age' column into categories
df['Age'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)
print(df['Age'])

0       40-50
1       40-50
2       30-40
3       30-40
4       20-30
        ...  
1465    30-40
1466    30-40
1467    20-30
1468    40-50
1469    30-40
Name: Age, Length: 1407, dtype: category
Categories (5, object): ['0-20' < '20-30' < '30-40' < '40-50' < '50+']


In [48]:
daily_rate_bins = [0, 500, 1000, 1500, float('inf')]  # Define the bin boundaries
daily_rate_labels = ['Low', 'Medium', 'High', 'Very High']  # Define labels for each category

# Create a new column 'DailyRateCategory' with categorical values
df['DailyRate'] = pd.cut(df['DailyRate'], bins=daily_rate_bins, labels=daily_rate_labels, right=False)
print(df['DailyRate'])

0         High
1          Low
2         High
3         High
4       Medium
         ...  
1465    Medium
1466    Medium
1467       Low
1468      High
1469    Medium
Name: DailyRate, Length: 1407, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']


In [49]:
# Define the bins and labels for 'MonthlyRate' categories
monthly_rate_bins = [0, 10000, 20000, 30000, float('inf')]  # Define the bin boundaries
monthly_rate_labels = ['Low', 'Medium', 'High', 'Very High']  # Define labels for each category

# Create a new column 'MonthlyRateCategory' with categorical values
df['MonthlyRate'] = pd.cut(df['MonthlyRate'], bins=monthly_rate_bins, labels=monthly_rate_labels, right=False)

# Print the DataFrame to see the changes
print(df[['MonthlyRate']])

     MonthlyRate
0         Medium
1           High
2            Low
3           High
4         Medium
...          ...
1465      Medium
1466        High
1467         Low
1468      Medium
1469      Medium

[1407 rows x 1 columns]


In [50]:
years_at_company_bins = [0, 2, 5, 10, float('inf')]  # Define the bin boundaries
years_at_company_labels = ['0-2', '3-5', '6-10', '11+']  # Define labels for each category

# Create a new column 'YearsAtCompanyCategory' with categorical values
df['YearsAtCompany'] = pd.cut(df['YearsAtCompany'], bins=years_at_company_bins, labels=years_at_company_labels, right=False)

# Print the DataFrame to see the changes
print(df['YearsAtCompany'] )

0       6-10
1        11+
2        0-2
3       6-10
4        3-5
        ... 
1465    6-10
1466    6-10
1467    6-10
1468    6-10
1469     3-5
Name: YearsAtCompany, Length: 1407, dtype: category
Categories (4, object): ['0-2' < '3-5' < '6-10' < '11+']


In [51]:
class OneR(object):
    
    def __init__(self):
        self.ideal_variable = None
        self.rule=None
        self.max_accuracy = 0
    
    def fit(self, X, y):
        response = list()
        result = dict()
        
        dfx = pd.DataFrame(X)
        
        for i in dfx:
            result[str(i)] = dict()
            join_data = pd.DataFrame({"v":dfx[i], "c":y})
            cross_table = pd.crosstab(join_data.v, join_data.c)
            summary = cross_table.idxmax(axis=1)
            result[str(i)] = dict(summary)
            
            counts = 0
            
            for idx, row in join_data.iterrows():
                if row['c'] == result[str(i)][row['v']]:
                    counts += 1

            accuracy = (counts/len(y))
            
            if accuracy > self.max_accuracy:
                self.max_accuracy = accuracy
                self.ideal_variable = i
                self.rule=result[str(i)]

            result_feature = {"variable": str(i), "accuracy":accuracy, "rules": result[str(i)] }  
            response.append(result_feature)
            
            
        
        return response

    
    def predict(self, X=None):
        dfx = pd.DataFrame(X)
        predicted_labels = []
        for idx, row in dfx.iterrows():
            predicted_label = self.rule.get(row[self.ideal_variable], None)
            predicted_labels.append(predicted_label)

        return predicted_labels
        
    
           
       

# k fold

In [52]:


def k_fold_cross_validation(model, data, labels, k):
    num_samples = len(data)
    fold_size = num_samples // k

    # Initialize lists to store evaluation metrics
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for i in range(k):
        # Define the indices for the current fold
        start = i * fold_size
        end = (i + 1) * fold_size
        test_indices = list(range(start, end))

        # Select the test set for the current fold
        test_data = data[test_indices]
        test_labels = labels[test_indices]

        # Select the training set for the current fold
        train_indices = list(range(0, start)) + list(range(end, num_samples))
        train_data = data[train_indices]
        train_labels = labels[train_indices]

        # Train the model on the training data
        model.fit(train_data, train_labels)

        # Make predictions on the test data
        y_pred = model.predict(test_data)

        # Calculate evaluation metrics for this fold
        accuracy = np.mean(y_pred == test_labels)
        if np.sum(y_pred == "Yes") == 0:
            precision = 0.0  # Handle the case when there are no positive predictions
            recall = 0.0
            f1 = 0.0
        else:
            precision = np.mean(y_pred[y_pred == "Yes"] == test_labels[y_pred == "Yes"])
            recall = np.mean(y_pred[y_pred == "Yes"] == test_labels[y_pred == "Yes"])
            f1 = 2 * (precision * recall) / (precision + recall)

        # Append the scores to the respective lists
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Return the mean scores across all folds
    return {
        "Accuracys": accuracy_scores,
        "Precisions": precision_scores,
        "Accuracy mean": np.mean(accuracy_scores),
        "Precision mean": np.mean(precision_scores),
        "Recall ": np.mean(recall_scores),
        "F1 Score ": np.mean(f1_scores)
    }


In [53]:
rule = OneR()
data = df.drop(['Attrition'], axis=1)  
labels = df['Attrition']

k_values = [3, 5, 7]

for k in k_values:
    result = k_fold_cross_validation(rule, data.to_numpy(), labels.to_numpy(), k)
    print("\n\n")
    print(f"Results for one R  with k={k}: \n")
    for metric, value in result.items():
        print(f"{metric}: {value}")








Results for one R  with k=3: 

Accuracys: [0.835820895522388, 0.8166311300639659, 0.8422174840085288]
Precisions: [0.0, 0.0, 0.0]
Accuracy mean: 0.8315565031982942
Precision mean: 0.0
Recall : 0.0
F1 Score : 0.0



Results for one R  with k=5: 

Accuracys: [0.8434163701067615, 0.8505338078291815, 0.7900355871886121, 0.8434163701067615, 0.8434163701067615]
Precisions: [0.0, 0.0, 0.0, 0.0, 0.0]
Accuracy mean: 0.8341637010676155
Precision mean: 0.0
Recall : 0.0
F1 Score : 0.0



Results for one R  with k=7: 

Accuracys: [0.8507462686567164, 0.845771144278607, 0.8407960199004975, 0.7761194029850746, 0.8258706467661692, 0.8557213930348259, 0.845771144278607]
Precisions: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Accuracy mean: 0.8343994314143568
Precision mean: 0.0
Recall : 0.0
F1 Score : 0.0
