In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the training data from train.csv (path is relative to the working directory)
df = pd.read_csv("train.csv")

In [None]:
# Preview the first ten rows of the loaded frame
df.head(10)

In [None]:
# Preview the last ten rows of the loaded frame
df.tail(10)

In [None]:
# One-hot encode the party affiliation and store the indicator columns as 0/1 integers
df = pd.get_dummies(df, columns=['Party'])
one_hot_columns = df.columns[df.columns.str.startswith('Party_')].tolist()
df[one_hot_columns] = df[one_hot_columns].astype(int)

# Discard whichever column sits at position 2 after the encoding above.
# NOTE(review): this is positional, so it depends on the CSV's column order --
# confirm which column is meant and consider dropping it by name.
df = df.drop(columns=df.columns[2])

# Show the first ten rows of the encoded frame
df.head(10)


In [None]:
def convert_to_numeric(s):
    """Convert an amount string such as '12 Crore+' into an integer.

    The suffixes 'crore+', 'lac+', 'thou+' and 'hund+' (matched
    case-insensitively, with or without a space before them) scale the leading
    number by 10**7, 10**5, 10**3 and 10**2 respectively. A string without any
    of these suffixes is parsed as a plain number and truncated to an integer.

    Args:
        s: Amount text, e.g. '5 Lac+', '3Thou+' or '0'.

    Returns:
        int: The amount as a whole number.

    Raises:
        ValueError: If the numeric part cannot be parsed by ``float``.
    """
    # Checked in this order; the first suffix found in the text wins.
    units = (
        ('crore+', 10**7),
        ('lac+', 10**5),
        ('thou+', 10**3),
        ('hund+', 10**2),
    )
    text = s.strip().lower()
    for suffix, multiplier in units:
        if suffix in text:
            # float() ignores surrounding whitespace, so removing only the
            # suffix handles both '5 lac+' and '5lac+'.
            number = float(text.replace(suffix, ''))
            # Round rather than truncate: binary floating point can land just
            # below the intended integer (0.29 * 100 == 28.999999999999996).
            return round(number * multiplier)
    return int(float(text))

# Turn the textual 'Total Assets' and 'Liabilities' amounts into integers
for amount_column in ('Total Assets', 'Liabilities'):
    df[amount_column] = df[amount_column].apply(convert_to_numeric)

In [None]:
# One-hot encode the state and store the indicator columns as 0/1 integers
df = pd.get_dummies(df, columns=['state'])
one_hot_columns = df.columns[df.columns.str.startswith('state_')].tolist()
df[one_hot_columns] = df[one_hot_columns].astype(int)


In [None]:
# List the distinct labels found in the 'Education' column
diff_level_of_educations = df['Education'].unique()
print(diff_level_of_educations)

In [None]:
# Ordinal codes for the education labels, listed in ascending code order
education_mapping = {
    'Others': 0,
    'Literate': 1,
    '5th Pass': 2,
    '8th Pass': 3,
    '10th Pass': 4,
    '12th Pass': 5,
    'Graduate': 6,
    'Graduate Professional': 7,
    'Post Graduate': 8,
    'Doctorate': 9,
}

# Label encode the 'Education' column.
# Series.map turns every label missing from the mapping into NaN without any
# warning (this also happens to every row if the cell is run a second time,
# because the column then already holds integer codes), so fail loudly instead.
education_codes = df['Education'].map(education_mapping)
unmapped_labels = df.loc[
    education_codes.isna() & df['Education'].notna(), 'Education'
].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unmapped 'Education' values: {unmapped_labels.tolist()}"
    )
df['Education'] = education_codes


In [None]:
# Preview the first ten rows of the frame at this point in the pipeline
df.head(10)

In [None]:
# Remove the 'ID' and 'Candidate' columns from the frame
df = df.drop(columns=['ID', 'Candidate'])



In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

#from sklearn.impute import SimpleImpute

In [None]:
# Preview the first ten rows of the frame that is about to be split into X and y
df.head(10)

In [None]:
# Separate the encoded 'Education' target from the remaining feature columns
y = df['Education']
X = df.drop(columns='Education')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Bernoulli naive Bayes classifier on the training portion.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so any
# non-binary feature column is reduced to zero / non-zero -- confirm this is intended.
model = BernoulliNB().fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Print the predicted education codes held in y_pred
print(y_pred)

In [None]:
# Load test.csv (path is relative to the working directory).
# NOTE(review): this rebinds `df`, so the frame previously held under that name
# is no longer reachable through it.
df = pd.read_csv('test.csv')


In [None]:
# One-hot encode the party affiliation and store the indicator columns as 0/1 integers
df = pd.get_dummies(df, columns=['Party'])
one_hot_columns = df.columns[df.columns.str.startswith('Party_')].tolist()
df[one_hot_columns] = df[one_hot_columns].astype(int)

# Discard whichever column sits at position 2 after the encoding above.
# NOTE(review): this is positional, so it depends on the CSV's column order --
# confirm which column is meant and consider dropping it by name.
df = df.drop(columns=df.columns[2])

# Convert string representation to numeric value
def convert_to_numeric(s):
    """Convert an amount string such as '12 Crore+' into an integer.

    The suffixes 'crore+', 'lac+', 'thou+' and 'hund+' (matched
    case-insensitively, with or without a space before them) scale the leading
    number by 10**7, 10**5, 10**3 and 10**2 respectively. A string without any
    of these suffixes is parsed as a plain number and truncated to an integer.

    Args:
        s: Amount text, e.g. '5 Lac+', '3Thou+' or '0'.

    Returns:
        int: The amount as a whole number.

    Raises:
        ValueError: If the numeric part cannot be parsed by ``float``.
    """
    # Checked in this order; the first suffix found in the text wins.
    units = (
        ('crore+', 10**7),
        ('lac+', 10**5),
        ('thou+', 10**3),
        ('hund+', 10**2),
    )
    text = s.strip().lower()
    for suffix, multiplier in units:
        if suffix in text:
            # float() ignores surrounding whitespace, so removing only the
            # suffix handles both '5 lac+' and '5lac+'.
            number = float(text.replace(suffix, ''))
            # Round rather than truncate: binary floating point can land just
            # below the intended integer (0.29 * 100 == 28.999999999999996).
            return round(number * multiplier)
    return int(float(text))  # If no suffix, assume it's already a numeric value

# Turn the textual 'Total Assets' and 'Liabilities' amounts into integers
for amount_column in ('Total Assets', 'Liabilities'):
    df[amount_column] = df[amount_column].apply(convert_to_numeric)

# One-hot encode the state and store the indicator columns as 0/1 integers
df = pd.get_dummies(df, columns=['state'])
one_hot_columns = df.columns[df.columns.str.startswith('state_')].tolist()
df[one_hot_columns] = df[one_hot_columns].astype(int)

In [None]:
# Build the feature matrix for the test data: drop the identifier columns, then
# align the columns with the training features in `X`. The two files were
# one-hot encoded independently, so a party/state present in only one of them
# would otherwise give the model a different (or differently ordered) set of
# columns than it was fitted on. Indicator columns absent from the test data are
# filled with 0; columns unknown to the model are discarded.
X_test = (
    df.drop(['ID', 'Candidate'], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)
expected_education = model.predict(X_test)

In [None]:
# Inverse of the ordinal encoding: each label's position in the list is its integer code
education_mapping = dict(enumerate([
    'Others',
    'Literate',
    '5th Pass',
    '8th Pass',
    '10th Pass',
    '12th Pass',
    'Graduate',
    'Graduate Professional',
    'Post Graduate',
    'Doctorate',
]))

# Translate every predicted code back into its education label
expected_education_labels = [education_mapping[code] for code in expected_education]

# Pair each test ID with its predicted label and write the result to submission.csv
# (the row index is left out of the file)
submission = pd.DataFrame(
    {
        'ID': df['ID'],
        'Education': expected_education_labels,
    }
)
submission.to_csv('submission.csv', index=False)