In [1]:
import pandas as pd
from faker import Faker
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
# Initialize Faker and seed every random source for reproducibility.
# Faker draws from its own generator, so seeding the stdlib `random` module
# alone leaves gender, education level, titles, etc. different on each run.
Faker.seed(42)
fake = Faker()
random.seed(42)

# Dummy Data Generation
# Users: one row per user_id (1..num_users) with random demographics.
num_users = 100
users_data = {'user_id': range(1, num_users + 1),
              'age': [random.randint(18, 60) for _ in range(num_users)],
              'gender': [fake.random_element(elements=('Male', 'Female')) for _ in range(num_users)],
              'education_level': [fake.random_element(elements=('High School', 'Bachelor', 'Master', 'PhD')) for _ in range(num_users)]}

users_df = pd.DataFrame(users_data)
users_df.to_csv('users.csv', index=False)

# Courses: one row per course_id (1..num_courses) with random attributes.
num_courses = 50
courses_data = {'course_id': range(1, num_courses + 1),
                'course_title': [fake.catch_phrase() for _ in range(num_courses)],
                'difficulty': [fake.random_element(elements=('Beginner', 'Intermediate', 'Advanced')) for _ in range(num_courses)],
                'category': [fake.random_element(elements=('Programming', 'Data Science', 'Design')) for _ in range(num_courses)],
                'duration': [random.randint(1, 30) for _ in range(num_courses)],
                'course_description': [fake.text() for _ in range(num_courses)]}

courses_df = pd.DataFrame(courses_data)
courses_df.to_csv('courses.csv', index=False)

# Enrolment records: num_users random (user_id, course_id) pairs with a random
# 0/1 course_taken label. Ids are sampled independently, so a user may appear
# several times or not at all, and the same pair may repeat.
education_data = {'user_id': [random.randint(1, num_users) for _ in range(num_users)],
                  'course_id': [random.randint(1, num_courses) for _ in range(num_users)],
                  'course_taken': [random.choice([0, 1]) for _ in range(num_users)]}

education_df = pd.DataFrame(education_data)
education_df.to_csv('education.csv', index=False)

print(users_df)



    user_id  age  gender education_level
0         1   58    Male          Master
1         2   25  Female          Master
2         3   19    Male             PhD
3         4   35    Male          Master
4         5   33    Male             PhD
..      ...  ...     ...             ...
95       96   38  Female             PhD
96       97   43  Female        Bachelor
97       98   35    Male             PhD
98       99   22    Male     High School
99      100   31    Male          Master

[100 rows x 4 columns]


In [2]:
# Data Loading and Merging
# Reload the three tables from the CSV files written by the data-generation cell.
users_df = pd.read_csv('users.csv')
courses_df = pd.read_csv('courses.csv')
education_df = pd.read_csv('education.csv')

# Attach user attributes, then course attributes, to every enrolment record.
merged_df = pd.merge(education_df, users_df, on='user_id')
merged_df = pd.merge(merged_df, courses_df, on='course_id')
print(merged_df)

    user_id  course_id  course_taken  age  gender education_level  \
0        33          4             0   39    Male          Master   
1        71          4             0   41    Male     High School   
2        21          4             1   53    Male          Master   
3         1          4             0   58    Male          Master   
4        78          4             0   59  Female        Bachelor   
..      ...        ...           ...  ...     ...             ...   
95       32         19             1   45    Male             PhD   
96       52         45             1   53    Male             PhD   
97       57         46             0   54  Female     High School   
98       58         10             0   30  Female     High School   
99       76         21             1   60  Female             PhD   

                                      course_title    difficulty  \
0        Face-to-face bandwidth-monitored approach      Beginner   
1        Face-to-face bandwidth-mon

In [3]:
# Feature Engineering
# Model inputs are the user attributes followed by the course attributes;
# the target is the 0/1 course_taken flag of each merged record.
user_features = ['age', 'gender', 'education_level']
course_features = ['difficulty', 'category', 'duration']
features = [*user_features, *course_features]

X = merged_df.loc[:, features]
y = merged_df.loc[:, 'course_taken']

print(X, y, sep='\n')



    age  gender education_level    difficulty      category  duration
0    39    Male          Master      Beginner  Data Science        23
1    41    Male     High School      Beginner  Data Science        23
2    53    Male          Master      Beginner  Data Science        23
3    58    Male          Master      Beginner  Data Science        23
4    59  Female        Bachelor      Beginner  Data Science        23
..  ...     ...             ...           ...           ...       ...
95   45    Male             PhD      Beginner   Programming        18
96   53    Male             PhD      Beginner   Programming         3
97   54  Female     High School  Intermediate  Data Science        13
98   30  Female     High School  Intermediate   Programming        29
99   60  Female             PhD      Advanced   Programming         9

[100 rows x 6 columns]
0     0
1     0
2     1
3     0
4     0
     ..
95    1
96    1
97    0
98    0
99    1
Name: course_taken, Length: 100, dtype: int64


In [4]:
# Model Training
# One-hot encode the categorical columns; 'age' and 'duration' pass through as numbers.
X_encoded = pd.get_dummies(X, columns=['gender', 'education_level', 'difficulty', 'category'])
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Pin random_state: the tree permutes features when searching for splits, so
# without it the fitted model (and the accuracy below) can change between runs
# even though the train/test split is seeded.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# The course_taken labels are drawn at random in the data-generation cell,
# so an accuracy close to 0.5 is the expected outcome here.
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 0.55


In [6]:
# Modify the recommendation function
def get_recommendations(user_info, top_n=3):
    """Recommend course titles for a user the model predicts will take a course.

    ``user_info`` is one-hot encoded, aligned to the training columns and passed
    to the trained ``model``. On a positive prediction, each course linked to the
    user in ``merged_df`` is used as a seed and the ``top_n`` most similar other
    courses from ``courses_df`` are collected. Similarity is the cosine similarity
    between courses over one-hot ``difficulty``/``category`` plus raw ``duration``.

    Parameters
    ----------
    user_info : dict
        Scalar values keyed by column name. Must contain 'user_id', 'gender',
        'education_level', 'difficulty' and 'category'; any other key that is
        not a training column is ignored, and missing training columns are
        filled with 0.
    top_n : int, default 3
        Maximum number of similar courses contributed by each seed course.

    Returns
    -------
    list of str or str
        Recommended course titles (without duplicates, possibly empty), or an
        explanatory message when the model predicts 0 or the user has no
        records in ``merged_df``.
    """
    categorical_columns = ['gender', 'education_level', 'difficulty', 'category']
    user_info_df = pd.DataFrame(user_info, index=[0])

    # Ensure columns are consistent with the model's feature names
    user_info_encoded = pd.get_dummies(user_info_df, columns=categorical_columns)
    user_info_encoded = user_info_encoded.reindex(columns=X_train.columns, fill_value=0)

    # model.predict returns a length-1 array; compare its single element
    # instead of relying on the truthiness of an array comparison.
    prediction = model.predict(user_info_encoded)
    if prediction[0] != 1:
        return "User not likely to take a course."

    # Seed courses: every course linked to this user in the merged records.
    # NOTE(review): as before, rows with course_taken == 0 are included — confirm intent.
    user_courses = merged_df.loc[merged_df['user_id'] == user_info['user_id'], 'course_id'].unique()
    if len(user_courses) == 0:
        return "No courses taken by the user, cannot provide recommendations."

    # Course-to-course similarity over the whole catalog. The previous code
    # compared two single vectors (one score, so the [1:4] slice was always
    # empty) and indexed the enrolment-level X_encoded by course_id, which
    # picked an unrelated row.
    catalog = courses_df.reset_index(drop=True)
    course_vectors = pd.get_dummies(
        catalog[['difficulty', 'category', 'duration']],
        columns=['difficulty', 'category'],
    ).astype(float)
    similarity = cosine_similarity(course_vectors)

    position_of = {cid: pos for pos, cid in enumerate(catalog['course_id'])}
    already_linked = set(user_courses)

    recommendations = []
    for course_id in user_courses:
        seed_pos = position_of.get(course_id)
        # Check if the course information is available
        if seed_pos is None or pd.isnull(catalog.at[seed_pos, 'course_title']):
            print(f"Missing information for course_id {course_id}")
            continue

        # Walk the catalog from most to least similar, skipping the seed itself
        # and any course the user is already linked to.
        picked = 0
        for pos in similarity[seed_pos].argsort()[::-1]:
            if picked >= top_n:
                break
            if catalog.at[pos, 'course_id'] in already_linked:
                continue
            picked += 1
            title = catalog.at[pos, 'course_title']
            if title not in recommendations:
                recommendations.append(title)

    return recommendations
# Example Usage with User Information likely to take a course
new_user_info_positive = {'user_id': 33, 'age': 39, 'gender': 'Male', 'education_level': 'High School',
                           'difficulty': 'Advanced', 'category': 'Data Science', 'duration': 23,
                           'course_description': fake.text()}

# Ensure the 'user_id' is valid
if new_user_info_positive['user_id'] in users_df['user_id'].values:

    # Fill only the fields that are actually missing. setdefault keeps the
    # difficulty, category and duration supplied above; plain assignment used to
    # overwrite them (and set a category the model was never trained on).
    new_user_info_positive.setdefault('course_title', 'Default Course Title')
    new_user_info_positive.setdefault('difficulty', 'Intermediate')
    new_user_info_positive.setdefault('category', 'Default Category')
    new_user_info_positive.setdefault('duration', 10)  # Use a reasonable default duration
    new_user_info_positive.setdefault('course_description', 'Default Course Description')

    recommended_courses_positive = get_recommendations(new_user_info_positive)
    print(f'Recommended Courses: {recommended_courses_positive}')

else:
    print(f"Invalid 'user_id' {new_user_info_positive['user_id']}. Choose a valid user_id.")




Recommended Courses: User not likely to take a course.
