# Task 2
# Method
The process performed to do the task 2:
- Normalise tokens to lowercase, except capitalised tokens that appear in the middle of a sentence/line
- Tokenize words by using the regular expression "\w+(?:[-']\w+)?"
- Build a vocabulary that includes the first 200 meaningful bigrams
- Remove the context-independent and context-dependent stopwords from the vocabulary
- Remove rare tokens from the vocabulary
- Stem tokens by using Porter stemmer
- Remove tokens with the length less than 3 from the vocabulary

Step 1: Import necessary modules for task 2 such as pandas and numpy and read the csv file

In [5]:
import pandas as pd
import numpy as np
# Load the raw unit table from unit.csv in the working directory
data = pd.read_csv('unit.csv')
# Preview the first rows to inspect the column layout
data.head()

Unnamed: 0,Title,Synopsis,Outcomes
0,FIT2100,This unit will provide students with the knowl...,['analyse and evaluate various strategies used...
1,,"how a multi-programming, multi-user operating",operating system in managing the system resources
2,,systems operates and it manages and allocate,"and running applications efficiently;', 'analy..."
3,,resources to different applications. Students ...,identify parameters that can improve the perfo...
4,,able to compare and contrast various resource,"of multi-programming operating systems;', 'app..."


Step 2: Reformat the data

In [6]:
# A unit's text is spread over several rows and only its first row carries the
# unit code, so copy each code down into the empty Title cells below it.
data['Title'] = data['Title'].ffill()


def _join_non_null(cells):
    """Concatenate the non-null cells of one unit, separated by single spaces."""
    return ' '.join(cell for cell in cells if not pd.isnull(cell))


# Collapse the Synopsis / Outcomes fragments of every unit into one string each
# and keep each result as a one-column dataframe indexed by Title.
data_synopsis = data.groupby('Title')['Synopsis'].apply(_join_non_null).to_frame()
data_outcomes = data.groupby('Title')['Outcomes'].apply(_join_non_null).to_frame()

# Combine the two one-column dataframes on their shared Title key
data = pd.merge(data_synopsis, data_outcomes, on='Title')

# Turn the Title index back into an ordinary column
data = data.reset_index()

data.head()

Unnamed: 0,Title,Synopsis,Outcomes
0,ACB2851,"The objective of this unit is two-fold. First,...",['examine the role of accounting information s...
1,ACS2700,This unit provides an introduction to ethical ...,['examine the ethical dimension of individual ...
2,ACW3050,Topics covered by this unit include: Australia...,['critically analyse how standard setting proc...
3,AHT3105,This unit considers contemporary international...,['Identify the wider concerns of contemporary ...
4,AMU1305,Film studies: Forms and approaches gives stude...,['Analyse films with particular attention to t...


Step 3: Remove unnecessary characters in each column

In [7]:
#Import re module to use regular expression
import re

#Create a function that returns any content from after ' or " to before ' or " and , or ]
def clean_outcomes(text):
    """Extract the quoted items from the text of a list literal.

    Returns every span that starts after a ' or " and ends before a ' or "
    that is immediately followed by , or ]. Null input yields an empty list.
    """
    # Lazy capture between two quote characters; the lookahead requires the
    # closing quote to be followed by , or ] without consuming that character.
    quoted_item = r"['\"](.*?)['\"](?=[,\]])"
    if not pd.isnull(text):
        return re.findall(quoted_item, text)
    return []

#Replace each unit's raw Outcomes text with the list of items clean_outcomes extracts from it
data['Outcomes'] = data['Outcomes'].apply(clean_outcomes)

Step 4: Normalize tokens to lowercase
- First, tokenize text into sentences
- Second, normalize tokens to lowercase, except capitalised tokens that appear in the middle of the sentence

In [8]:
#First, tokenize text into sentences
#Import nltk, download the 'punkt' package and load the English Punkt sentence detector from it
# NOTE(review): newer NLTK releases may distribute this model as 'punkt_tab' rather than
# as a pickle - confirm the resource name against the installed NLTK version
import nltk
nltk.download('punkt')
import nltk.data
# sent_detector is the module-level detector used by segmentation() below
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

#Create a function to tokenize text into sentences
def segmentation(text):
    """Split text into a list of sentences using the loaded Punkt detector.

    Null input yields an empty list.
    """
    return [] if pd.isnull(text) else sent_detector.tokenize(text)

# Apply tokenizing function into content of each unit of column Outcomes and Synopsis 
# The result will be a list of each sentence
# Split every outcome and every synopsis into sentences; each cell becomes a
# flat list of sentence strings.
data['Outcomes'] = data['Outcomes'].apply(
    lambda outcomes: [sentence for outcome in outcomes for sentence in segmentation(outcome)]
)
data['Synopsis'] = data['Synopsis'].apply(segmentation)


def _lowercase_first_char(sentences):
    """Return the sentences with only their first character lowercased."""
    # Slicing with [:1] keeps an empty sentence empty instead of raising IndexError
    return [sentence[:1].lower() + sentence[1:] for sentence in sentences]


# Second, lowercase only the first character of each sentence, so capitals in
# the middle of a sentence are kept as they are.
# This is done per column rather than through data.Outcomes[x][y] = ..., which
# relied on the index being exactly 0..n-1 and on mutating the stored lists
# in place through chained indexing.
data['Outcomes'] = data['Outcomes'].apply(_lowercase_first_char)
data['Synopsis'] = data['Synopsis'].apply(_lowercase_first_char)

data.Outcomes[0]
#data.Synopsis[0]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikeloongboong/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['examine the role of accounting information systems in analysing and providing decision support to managers',
 'explain the design of accounting information systems and financial models',
 'develop financial models to assist in decision making',
 'apply critical thinking, problem solving and presentation skills to individual and / or group activities dealing with accounting information systems and demonstrate in an individual summative assessment task the acquisition of a comprehensive understanding of the topics covered by ACB2851.']

Step 5: Tokenize word by using the regular expression "\w+(?:[-']\w+)?"

In [9]:
#Merge the sentence lists of column Synopsis and Outcomes into a new column Content
# The two columns are paired row by row with zip. The previous x[0] + x[1] used
# integer keys on a row labelled 'Synopsis'/'Outcomes', a positional fallback
# that pandas has deprecated.
data["Content"] = pd.Series(
    [synopsis + outcomes for synopsis, outcomes in zip(data["Synopsis"], data["Outcomes"])],
    index=data.index,
)

# Import tokenize module
from nltk.tokenize import RegexpTokenizer 

# Create a function that breaks a long sequence of characters into word tokens
# Word pattern required by the task: a run of word characters, optionally
# followed by ONE hyphen- or apostrophe-joined continuation (e.g. "well-known",
# "it's"). The earlier character class [-.] did not match this specification.
# Compiled once here instead of building a new tokenizer object on every call.
_WORD_PATTERN = re.compile(r"\w+(?:[-']\w+)?")


def tokenizer(sentence):
    """Break a sentence into word tokens.

    Args:
        sentence: The text to tokenize.

    Returns:
        The list of non-overlapping matches of the word pattern, in order of
        appearance; an empty list when nothing matches.
    """
    return _WORD_PATTERN.findall(sentence)

#Tokenize every sentence of column Content and gather the tokens of a unit into one flat list
def _tokenize_sentences(sentences):
    """Return the word tokens of all given sentences as a single list."""
    unit_tokens = []
    for sentence in sentences:
        unit_tokens.extend(tokenizer(sentence))
    return unit_tokens


data['Content_tokens'] = data['Content'].apply(_tokenize_sentences)

#data.Content_tokens[4]

Step 6: Build a vocabulary that includes the first 200 meaningful bigrams

In [10]:
#Collect the unigrams extracted in step 5 from every unit into one flat list
all_tokens = [token for unit_tokens in data['Content_tokens'] for token in unit_tokens]

#Find the first 200 meaningful bigrams from all tokens
#Import tools to identify collocations and to score an n-gram given its frequency counts
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
#Find bigram collocations from all tokens
finder = BigramCollocationFinder.from_words(all_tokens)
#Take the 200 bigrams with the highest PMI score
bigrams = finder.nbest(BigramAssocMeasures.pmi, 200)

#Re-tokenize so that each of the 200 bigrams becomes a single token
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer(bigrams)
# NOTE(review): all_tokens is one concatenated stream, so a bigram could span the
# last token of one unit and the first token of the next - confirm this is acceptable
all_tokens = mwe_tokenizer.tokenize(all_tokens)

#Re-tokenize the column Content_tokens unit by unit
tokenize_content_tokens = [mwe_tokenizer.tokenize(unit_tokens) for unit_tokens in data['Content_tokens']]

# Replace the whole column in one assignment; the previous element-wise
# data.Content_tokens[i] = ... was a chained assignment that depended on the
# index being 0..n-1
data['Content_tokens'] = pd.Series(tokenize_content_tokens, index=data.index)

#Convert the list of all tokens into a set to drop duplicate tokens
vocab = set(all_tokens)

#vocab

#vocab

Step 7: Remove the context-independent and context-dependent stopwords from the vocabulary

In [11]:
#First, drop the stopwords listed in stopwords_en.txt (one per line) from the vocabulary
with open('./stopwords_en.txt') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())
vocab = vocab.difference(stopwords)

# Second, drop the context-dependent stopwords from the vocabulary

# Import module that helps identify common words
from nltk.probability import *
from itertools import chain
# Reduce every unit to its distinct tokens before counting, so each count in
# fd_2 is the number of units that contain the token
all_tokens2 = [token for unit_tokens in data.Content_tokens for token in set(unit_tokens)]
fd_2 = FreqDist(all_tokens2)
# fd_2.most_common()

# Tokens found in more than 95% of the units are treated as too common to keep
frequent_cutoff = 0.95 * data.Title.count()
frequent_words = {token for token, unit_freq in fd_2.items() if unit_freq > frequent_cutoff}
# print(frequent_words)
vocab = vocab.difference(frequent_words)
# vocab
# vocab

Step 8: Remove rare tokens from the vocabulary

In [12]:
#Remove tokens that occur in fewer than 5% of the units from the vocabulary
rare_cutoff = 0.05 * data.Title.count()
rare_words = {token for token, unit_freq in fd_2.items() if unit_freq < rare_cutoff}
vocab = vocab.difference(rare_words)
#vocab = list(vocab)
# vocab

Step 9: Stem tokens by using Porter stemmer

In [13]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Different words can share a stem; collecting the stems in a set keeps one copy of each
vocab_stemmer = {ps.stem(word) for word in vocab}
# vocab_stemmer

Step 10: Remove tokens with the length less than 3 from the vocabulary

In [14]:
# Collect the stems shorter than three characters, then drop them from the vocabulary
set_three_words = {stem for stem in vocab_stemmer if len(stem) < 3}
vocab_stemmer = vocab_stemmer.difference(set_three_words)

Step 11: Extract vocabulary into a text file

In [15]:
#Map every stem in vocab_stemmer to its position in the sorted vocabulary
inverse_vocab = {stem: position for position, stem in enumerate(sorted(vocab_stemmer))}

#Write the vocabulary to a text file, one "stem:index" line per entry
with open('29911508_vocab.txt', 'w') as vocab_file:
    vocab_file.writelines(f'{stem}:{position}\n' for stem, position in inverse_vocab.items())

Step 12: Create a function that counts vector of tokens and extracts data into a text file

In [16]:
#Stem the tokens of each unit in column Content_tokens
ps = PorterStemmer()
data['Content_tokens_stem'] = data['Content_tokens'].apply(lambda tokens: list(map(ps.stem, tokens)))

#Create a new column Vector holding the count vector of each unit
from sklearn.feature_extraction.text import CountVectorizer


def _identity(tokens):
    """Return the input unchanged; the units are already lists of stemmed tokens."""
    return tokens


# The vocabulary is fixed to inverse_vocab and both the preprocessor and the
# tokenizer pass the token list through untouched
vectorizer = CountVectorizer(vocabulary=inverse_vocab, lowercase=False,
                             preprocessor=_identity, tokenizer=_identity)
data['Vector'] = data['Content_tokens_stem'].apply(lambda tokens: vectorizer.fit_transform([tokens]))

def write_countvec(file, content):
    """Write one unit's count vector to ``file`` as a single text line.

    The line has the form ``<title>, <index>:<count>,<index>:<count>,...``
    followed by a newline.

    Args:
        file: Writable text stream.
        content: Iterable whose first two items are the unit title and its
            count vector. The vector must expose parallel ``indices`` and
            ``data`` sequences.
    """
    # Unpack by iteration rather than content[0] / content[1]: when the caller
    # passes a row labelled 'Title'/'Vector', integer keys are a deprecated
    # positional lookup, while iteration yields the values in order for rows,
    # tuples and lists alike.
    title, vector, *_ = content
    vector_str = ','.join(f'{i}:{c}' for i, c in zip(vector.indices, vector.data))
    file.write(f'{title}, {vector_str}\n')
        
# Write one line per unit to countVec.txt.
# An explicit loop is used instead of DataFrame.apply(axis=1): apply is meant
# for computing values, and here it was called only for its file-writing side
# effect while its result was discarded.
with open('countVec.txt', 'w') as f:
    for title_and_vector in zip(data['Title'], data['Vector']):
        write_countvec(f, title_and_vector)