### import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
import json
import string

### load dataset

In [2]:
# Load the course table; the first CSV column is used as the DataFrame index.
# NOTE(review): the info() output reports 49788 entries with labels "0 to 2126",
# which suggests the index labels are not unique — confirm before relying on
# label-based alignment or .loc lookups.
df = pd.read_csv('../HomeworkThree/dataset.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49788 entries, 0 to 2126
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   course_name               49242 non-null  object
 1   course_instructor_site    48599 non-null  object
 2   course_site               46177 non-null  object
 3   course_instructor         46113 non-null  object
 4   course_cost               22373 non-null  object
 5   course_credential         35877 non-null  object
 6   course_level              21285 non-null  object
 7   course_duration           45048 non-null  object
 8   course_language           46125 non-null  object
 9   course_caption_languages  18959 non-null  object
 10  overview                  47648 non-null  object
 11  syllabus                  24037 non-null  object
 12  subject                   49162 non-null  object
dtypes: object(13)
memory usage: 5.3+ MB


#### detect free courses and extract numbers

In [3]:
def process_cost(cost):
    """Convert a raw price/credential string into a whole-dollar amount.

    Parameters
    ----------
    cost : Any
        Raw cell value. It is coerced with ``str()``, so NaN / None are accepted.

    Returns
    -------
    int or None
        * the integer dollar part when the text starts with ``$`` followed by
          digits (thousands separators are dropped, cents are truncated);
        * ``0`` when the text contains ``Free`` or ``free``;
        * ``None`` when no price can be derived (missing, empty or
          unrecognised text).
    """
    # Drop thousands separators so "$1,299.00" parses as 1299.
    cost = str(cost).replace(',', '')

    # startswith() is safe on an empty string, whereas cost[0] raises IndexError.
    if cost.startswith('$'):
        # Read only the leading digits. This handles "$149.00 ..." as before and
        # also "$149 Certificate Available" (no decimal point), which previously
        # made int() raise ValueError.
        match = re.match(r'\$\s*(\d+)', cost)
        if match:
            return int(match.group(1))

    if 'Free' in cost or 'free' in cost:
        return 0

    return None

In [4]:
# Parse the credential text into whole-dollar amounts (None where no price is found).
# NOTE: this cell is not idempotent — on the already-numeric column process_cost()
# finds neither a leading '$' nor 'free' and returns None for every row, so reload
# `df` before re-running it.
df['course_credential'] = df['course_credential'].apply(process_cost)
# Impute unknown prices with the column mean. The result is assigned back rather
# than calling fillna(inplace=True) on df[...]: that chained inplace call acts on
# an intermediate object, emits a FutureWarning on pandas 2.x and leaves `df`
# unchanged under Copy-on-Write.
df['course_credential'] = df['course_credential'].fillna(df['course_credential'].mean())

#### turn the caption-languages string into a list of languages

In [5]:
def str2list(text):
    """Split a ``', '``-separated string into a list of its parts.

    The input is stringified first; a value whose string form is ``'nan'``
    (e.g. a float NaN) or the empty string yields an empty list.
    """
    raw = str(text)
    is_missing = raw in ('nan', '')
    return [] if is_missing else raw.split(', ')

In [6]:
# Replace each ', '-separated caption-language string with a list (missing -> []).
# NOTE: not idempotent — str2list() stringifies its input, so re-running this on
# the already-converted lists would split their repr into pieces like "['Arabic'".
df['course_caption_languages'] = df['course_caption_languages'].apply(str2list)

In [7]:
# Re-check dtypes and non-null counts after the two column conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49788 entries, 0 to 2126
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   course_name               49242 non-null  object 
 1   course_instructor_site    48599 non-null  object 
 2   course_site               46177 non-null  object 
 3   course_instructor         46113 non-null  object 
 4   course_cost               22373 non-null  object 
 5   course_credential         49788 non-null  float64
 6   course_level              21285 non-null  object 
 7   course_duration           45048 non-null  object 
 8   course_language           46125 non-null  object 
 9   course_caption_languages  49788 non-null  object 
 10  overview                  47648 non-null  object 
 11  syllabus                  24037 non-null  object 
 12  subject                   49162 non-null  object 
dtypes: float64(1), object(12)
memory usage: 5.3+ MB


In [8]:
# Export of the processed frame is disabled; uncomment the next line to write it out.
# df.to_csv('../HomeworkThree/dataset_processed.csv')
# Preview the first ten rows of the processed frame.
df.head(10)

Unnamed: 0,course_name,course_instructor_site,course_site,course_instructor,course_cost,course_credential,course_level,course_duration,course_language,course_caption_languages,overview,syllabus,subject
0,Machine Learning,Stanford University,Coursera,Andrew Ng,Free Online Course (Audit),99.972306,,"11 weeks long, 60 hours worth of material",English,"[Arabic, French, Portuguese, Chinese, Italian,...",Machine learning is the science of getting com...,Introduction\nWelcome to Machine Learning! In ...,Computer Science
1,"Information Systems Auditing, Controls and Ass...",The Hong Kong University of Science and Techno...,Coursera,Garvin Percy DIAS,Free Online Course (Audit),99.972306,Beginner,"4 weeks long, 8 hours worth of material",English,"[Arabic, French, Portuguese, Italian, German, ...",The course is awarded The Best Free Online Cou...,Introduction to Information Systems (IS) Audit...,Computer Science
2,Unlocking Information Security I: From Cryptog...,Tel Aviv University,edX,Avishai Wool and Dan Gittik,Free Online Course (Audit),149.0,Intermediate,"5 weeks long, 4-6 hours a week",English,"[Arabic, English, Hebrew]",Information Security is everywhere: as the wor...,,Computer Science
3,Elements of AI,,,,,99.972306,,,,[],The goal of this course is to demystify AI\nTh...,Part 1\nWhat is AI?\nPart 2\nSolving problems ...,Computer Science
4,Cyber Security,UGC,Swayam,Prof. Padmavathi G,Free Online Course,99.972306,,15 weeks long,English,[],1. Why this Course on Cyber Security?The cours...,Week - 1\n1.Introduction to Cyber Space2.Intro...,Computer Science
5,Introduction to Cyber Security,"Uttarakhand Open University, Haldwani",Swayam,Dr. Jeetendra Pande,Free Online Course,99.972306,,12 weeks long,English,[],Internet has led to widespread and drastic cha...,Week 1\nIntroduction to Cyber Space\nHistory o...,Computer Science
6,Introduction to Computer Science and Programmi...,Massachusetts Institute of Technology,edX,"Ana Bell, Eric Grimson and John Guttag",Free Online Course (Audit),75.0,Beginner,"9 weeks long, 14-16 hours a week",English,[English],This course is the first of a two-course seque...,,Computer Science
7,CS50's Introduction to Computer Science,Harvard University,edX,David J. Malan,Free Online Course (Audit),149.0,Beginner,"12 weeks long, 6-18 hours a week",English,[English],"This is CS50x , Harvard University's introduct...",,Computer Science
8,Computational Social Science Methods,"University of California, Davis",Coursera,Martin Hilbert,Free Online Course (Audit),99.972306,Beginner,"4 weeks long, 11 hours worth of material",English,"[Arabic, French, Portuguese, Italian, German, ...",This course gives you an overview of the curre...,Computational Social Science (CSS)\nIn this mo...,Computer Science
9,"Divide and Conquer, Sorting and Searching, and...",Stanford University,Coursera,Tim Roughgarden,Free Online Course (Audit),99.972306,Intermediate,"4 weeks long, 16 hours worth of material",English,"[Arabic, French, Portuguese, Italian, German, ...",The primary topics in this part of the special...,"Week 1\nIntroduction; ""big-oh"" notation and as...",Computer Science


### clean texts

In [None]:
def remove_special_characters(text):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    text : str or None or float
        Text to clean. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as empty text.

    Returns
    -------
    str
        ``text`` with all other characters removed; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # A-Z, not A-z: in ASCII the range A-z also spans [ \ ] ^ _ and `, so those
    # punctuation characters used to survive the clean-up.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
def tokenize_text(text):
  """Lower-case ``text`` and split it on runs of whitespace into a token list."""
  lowered = text.lower()
  return lowered.split()

In [None]:
# Fetch the NLTK 'stopwords' resource (this call runs every time the cell runs).
nltk.download('stopwords')
# English stop words from NLTK; remove_stopwords() below reads this module-level name.
stopword = stopwords.words('english')

def remove_stopwords(tokenized_overview):
  """Return the tokens of ``tokenized_overview`` that are not stop words.

  Parameters
  ----------
  tokenized_overview : iterable of str
      Tokens to filter.

  Returns
  -------
  list of str
      The tokens that do not appear in the module-level ``stopword``
      collection, in their original order.
  """
  # Build a set once per call so each membership test is a hash lookup instead of
  # a scan over the whole stop-word collection for every token. Reading
  # `stopword` at call time keeps later changes to it visible.
  stop_set = set(stopword)
  return [word for word in tokenized_overview if word not in stop_set]

In [None]:
# Module-level Porter stemmer instance; stemmer() below calls ps.stem on each token.
ps = PorterStemmer()

def stemmer(tokenized_overview):
  """Return a new list holding ``ps.stem(token)`` for each token, in order."""
  return [ps.stem(token) for token in tokenized_overview]

In [None]:
# Fetch the NLTK 'wordnet' resource (this call runs every time the cell runs).
nltk.download('wordnet')
# Module-level lemmatizer instance; lemmatizer() below calls wn.lemmatize on each token.
wn = WordNetLemmatizer()

def lemmatizer(tokenized_overview):
  """Return a new list holding ``wn.lemmatize(token)`` for each token, in order."""
  return [wn.lemmatize(token) for token in tokenized_overview]

In [None]:
def l2s(tokenize_text):
  """Join an iterable of string tokens into one string separated by single spaces."""
  return " ".join(tokenize_text)

In [None]:
# NOTE(review): 'x' looks like a placeholder — the df.info() output earlier in this
# notebook lists no column named 'x', so df['x'] raises KeyError as written.
# Substitute the text column to clean (possibly 'overview', going by the helpers'
# parameter names — confirm).
# NOTE(review): the text columns contain missing values (see the non-null counts in
# df.info()); check that the cleaning helpers cope with NaN, or fill them first.
# Step 1: overwrite the raw text with the special-character-stripped version.
df['x'] = df['x'].apply(remove_special_characters)
# Step 2: lower-case and split on whitespace into a token list.
df['tokenized_x'] = df['x'].apply(tokenize_text)
# Step 3: drop English stop words.
df['tokenized_x'] = df['tokenized_x'].apply(remove_stopwords)
# Steps 4-5: Porter-stem each token, then pass the stems through the WordNet lemmatizer.
df['tokenized_x'] = df['tokenized_x'].apply(stemmer)
df['tokenized_x'] = df['tokenized_x'].apply(lemmatizer)
# Step 6: join the tokens back into one space-separated string.
df['cleaned_x'] = df['tokenized_x'].apply(l2s)