### import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
import json
import string

### load dataset

In [2]:
# Load the course dataset; index_col=0 makes the first CSV column the row index.
df = pd.read_csv('../HomeworkThree/dataset.csv', index_col=0)
# Print a summary of the columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 1099
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   course_name               1100 non-null   object
 1   course_instructor_site    1090 non-null   object
 2   course_site               1054 non-null   object
 3   course_instructor         1070 non-null   object
 4   course_cost               897 non-null    object
 5   course_credential         819 non-null    object
 6   course_level              725 non-null    object
 7   course_duration           1041 non-null   object
 8   course_language           1054 non-null   object
 9   course_caption_languages  716 non-null    object
 10  overview                  1060 non-null   object
 11  syllabus                  848 non-null    object
 12  subject                   1100 non-null   object
dtypes: object(13)
memory usage: 120.3+ KB


#### detect free courses and extract numbers

In [3]:
def process_cost(cost):
    """Convert a raw price description into a whole-dollar amount.

    The value is converted with ``str`` and stripped of surrounding
    whitespace, then interpreted as follows:

    * text starting with ``$`` followed by digits -> the integer dollar part
      (thousands separators are dropped, cents are truncated, so
      ``'$1,299.50'`` gives ``1299`` and ``'$0.99'`` gives ``0``);
    * otherwise, text containing "free" in any capitalisation -> ``0``;
    * anything else (including ``NaN`` and the empty string) -> ``None``.

    Args:
        cost: The raw cell value; any object accepted by ``str``.

    Returns:
        ``int`` price in dollars, ``0`` for free items, or ``None`` when no
        price can be determined.
    """
    cost = str(cost).strip()

    # Match only the leading dollar amount instead of slicing on '.', so that
    # empty strings, "$1,299.00" and "$49 Certificate" no longer raise.
    match = re.match(r'\$\s*(\d[\d,]*)', cost)
    if match:
        return int(match.group(1).replace(',', ''))

    # Case-insensitive so that "FREE" is recognised as well as "Free"/"free".
    if 'free' in cost.lower():
        return 0

    return None

In [4]:
# Replace the raw credential text with the numeric price parsed by process_cost.
# NOTE(review): process_cost is applied to 'course_credential' while
# 'course_cost' is left untouched — confirm this is the intended column.
df['course_credential'] = df['course_credential'].apply(process_cost)
# Fill rows without a parsable price with the column mean. The result is
# assigned back instead of calling fillna(inplace=True) on the selected column:
# that chained in-place form is deprecated and does not update df when pandas
# Copy-on-Write is enabled.
df['course_credential'] = df['course_credential'].fillna(df['course_credential'].mean())

#### turn caption-language string into a list of languages

In [9]:
def str2list(text):
    """Split a comma-separated value into a list of its items.

    The value is first converted with ``str``. If that yields ``'nan'`` or
    the empty string, an empty list is returned; otherwise the string is
    split on the exact separator ``', '``.
    """
    as_string = str(text)
    is_missing = as_string in ('nan', '')
    return [] if is_missing else as_string.split(', ')

In [10]:
# Turn each caption-language cell into a list (empty list for missing values).
df['course_caption_languages'] = df['course_caption_languages'].map(str2list)

In [11]:
# Re-inspect non-null counts and dtypes after the column transformations above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 1099
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   course_name               1100 non-null   object 
 1   course_instructor_site    1090 non-null   object 
 2   course_site               1054 non-null   object 
 3   course_instructor         1070 non-null   object 
 4   course_cost               897 non-null    object 
 5   course_credential         1100 non-null   float64
 6   course_level              725 non-null    object 
 7   course_duration           1041 non-null   object 
 8   course_language           1054 non-null   object 
 9   course_caption_languages  1100 non-null   object 
 10  overview                  1060 non-null   object 
 11  syllabus                  848 non-null    object 
 12  subject                   1100 non-null   object 
dtypes: float64(1), object(12)
memory usage: 120.3+ KB


### clean texts

In [None]:
def remove_special_characters(text):
    """Remove every character that is not an ASCII letter, a digit 0-9 or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters (punctuation, symbols, ...) deleted.
    """
    # The upper-case range must be ``A-Z``. ``A-z`` also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

In [None]:
# Download the NLTK 'stopwords' corpus and load its English word list.
nltk.download('stopwords')
stopword = stopwords.words('english')


def remove_stopwords(tokenized_overview):
    """Return the tokens of ``tokenized_overview`` that are not in ``stopword``.

    Order and duplicates of the remaining tokens are preserved.
    """
    return [token for token in tokenized_overview if token not in stopword]

In [None]:
# Shared Porter stemmer instance used by stemmer().
ps = PorterStemmer()


def stemmer(tokenized_overview):
    """Return a list with ``ps.stem`` applied to every token, in order."""
    return [ps.stem(token) for token in tokenized_overview]

In [None]:
# Download the NLTK 'wordnet' corpus and create the lemmatizer used by lemmatizer().
nltk.download('wordnet')
wn = WordNetLemmatizer()


def lemmatizer(tokenized_overview):
    """Return a list with ``wn.lemmatize`` applied to every token, in order."""
    return [wn.lemmatize(token) for token in tokenized_overview]

In [None]:
def l2s(tokenize_text):
    """Join a sequence of string tokens into one space-separated string."""
    return " ".join(tokenize_text)

In [None]:
# Text-cleaning pipeline for one column.
# NOTE(review): 'x' looks like a placeholder for the text column to clean
# (the loaded dataset shows no column named 'x') — confirm before running.
df['x'] = df['x'].apply(remove_special_characters)

# Tokenize, then run each token-level step in order.
tokens = df['x'].apply(tokenize_text)
for step in (remove_stopwords, stemmer, lemmatizer):
    tokens = tokens.apply(step)

df['tokenized_x'] = tokens
# Re-join the processed tokens into a single cleaned string per row.
df['cleaned_x'] = tokens.apply(l2s)

In [None]:
# Preview the first 10 rows of the DataFrame.
df.head(10)