In [0]:
# Mount Google Drive at /content/gdrive. The data_dir path configured in the
# next cell lives under this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Directory (on the mounted Drive) that holds every input and output CSV.
data_dir = "/content/gdrive/My Drive/Term Project 530/Data/"

# Alternative input files, currently disabled:
# student_genie_file = data_dir + "StudentGenieMessages.csv"
# student_genie_teacher_file = data_dir + "StudentTeacherMessages.csv"
# short_student_genie_file = data_dir + "short_StudentGenie.csv"

# One (input, output) path pair per chunk. Both file names are derived from
# the chunk's ordinal; the output name is the input name prefixed with "norm_".
(
    (first_input, first_output),
    (second_input, second_output),
    (third_input, third_output),
    (fourth_input, fourth_output),
    (fifth_input, fifth_output),
    (sixth_input, sixth_output),
    (seventh_input, seventh_output),
) = (
    (
        f"{data_dir}{ordinal}_5000_StudentGenie.csv",
        f"{data_dir}norm_{ordinal}_5000_StudentGenie.csv",
    )
    for ordinal in (
        "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    )
)

In [0]:
pip install ekphrasis

In [0]:
# Fetch NLTK's stopword corpus; the Normalizer class defined later in this
# notebook loads it with stopwords.words('english').
import nltk
nltk.download('stopwords')

In [0]:
# utils.py

import csv

def import_data(filename):
    """Load a CSV of student messages into a dictionary.

    The file is decoded as Latin-1 and parsed with ``csv.DictReader``; it must
    provide ``stud`` and ``content`` columns (a missing column raises
    ``KeyError``).

    Returns:
        dict mapping each row's ``stud`` value to its ``content`` value. If a
        ``stud`` value occurs on several rows, the last row wins.
    """
    with open(filename, newline='', encoding='latin') as handle:
        return {
            record['stud']: record['content']
            for record in csv.DictReader(handle)
        }

def split_sentences(data_dict):
    """Split each student's ``|``-delimited content into a list of pieces.

    Every piece has its surrounding whitespace trimmed; empty pieces are kept.
    A new dictionary is returned and ``data_dict`` is left unmodified.
    """
    return {
        student: [piece.strip() for piece in content.split('|')]
        for student, content in data_dict.items()
    }

def old_split_data_dict(split_dict, n):
    """Partition ``split_dict`` into ``n`` equally sized dictionaries.

    Each returned dictionary receives ``int(len(split_dict) / n)`` consecutive
    entries, in iteration order. Entries left over after the ``n`` full
    batches are dropped, and when there are fewer entries than ``n`` every
    returned dictionary is empty.

    Returns:
        list of ``n`` dictionaries.
    """
    buckets = [{} for _ in range(n)]
    batch_size = int(len(split_dict) / n)
    if batch_size <= 0:
        # Not enough entries to fill even one slot per bucket.
        return buckets

    kept = batch_size * n  # entries at positions >= kept are discarded
    for position, (student, sents) in enumerate(split_dict.items()):
        if position >= kept:
            break
        buckets[position // batch_size][student] = sents

    return buckets
  
def split_data_dict(split_dict, n):
    """Partition ``split_dict`` into ``n`` equal batches plus an overflow batch.

    The first ``n`` dictionaries each receive ``len(split_dict) // n``
    consecutive entries (in iteration order). The final, ``n + 1``-th
    dictionary receives every entry that is left over, so no entry is ever
    dropped: the sizes of the returned dictionaries always sum to
    ``len(split_dict)``.

    Args:
        split_dict: mapping to partition.
        n: number of equally sized batches; must be a positive integer.

    Returns:
        list of ``n + 1`` dictionaries; the last one may be empty.

    Raises:
        ValueError: if ``n`` is negative.
        ZeroDivisionError: if ``n`` is 0.
    """
    if n < 0:
        raise ValueError("n must be a positive integer")

    batch_size = len(split_dict) // n
    buckets = [{} for _ in range(n + 1)]
    for position, (student, sents) in enumerate(split_dict.items()):
        # Anything past the n full batches belongs to the overflow bucket at
        # index n. That includes every entry when batch_size is 0, and all
        # leftovers even when there are more of them than batch_size.
        bucket = min(position // batch_size, n) if batch_size else n
        buckets[bucket][student] = sents

    return buckets

def export_normalized_data(datadict, outfile, normalized):
    """Write normalized messages to a two-column CSV file.

    Args:
        datadict: accepted for call compatibility; not read by this function.
        outfile: path of the CSV to create (an existing file is overwritten).
        normalized: mapping whose keys are written to the ``stud`` column and
            whose values are written to the ``messages`` column.
    """
    print("Exporting data...")
    with open(outfile, 'w', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=['stud', 'messages'])
        out.writeheader()
        out.writerows(
            {'stud': student, 'messages': normalized[student]}
            for student in normalized
        )


In [0]:
import argparse
import csv
import itertools
import os
import re
import string

from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.tokenizer import SocialTokenizer, Tokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from nltk.corpus import stopwords


class Normalizer(TextPreProcessor):
    """Message normalizer built on top of ekphrasis' ``TextPreProcessor``.

    Adds a set of extra cleaning stages (tag and stopword removal, punctuation
    stripping, word segmentation, spell correction, ...) and a ``normalize``
    entry point that runs all of them over every message of every student.

    All keyword arguments are forwarded unchanged to ``TextPreProcessor``.
    One extra option is read here:

        remove_tags (bool, default True): whether ``strip_tags`` drops the
            tokens listed in ``self.tags``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.remove_tags = kwargs.get("remove_tags", True)

        # Tokens that strip_tags removes when remove_tags is enabled.
        self.tags = ['<repeated>', '<emphasis>', '<email>']

        # add custom stopwords from your dataset here
        custom_stopwords = {
            'from',
            'subject',
            're',
            'edu',
            'use',
            'propername',
            'hey',
            'dear',
            'genie',
            'message',
            'messages',
            'ginie',
        }
        # Effective stopword set: the custom words plus NLTK's English list.
        self.stopwords = custom_stopwords.union(stopwords.words('english'))

        self.seg = Segmenter()
        self.spell = SpellCorrector()

        # self.regexes is not defined in this class; presumably it is set up
        # by TextPreProcessor.__init__ - verify against the ekphrasis version.
        self.elongated = self.regexes["elongated"]
        # Raw strings keep the backslashes literal; the compiled patterns are
        # the same as before, without relying on invalid string escapes.
        self.mini_elongated = re.compile(r"([a-zA-Z])\1\1")
        self.propername_regex = "(propername)+"
        self.repeated_digits = re.compile(r"\d{5,}")

    def handle_repeated_digits(self, sentence: str):
        """Delete every run of five or more consecutive digits."""
        return self.repeated_digits.sub('', sentence)

    def handle_elongated(self, sentence: str):
        """Clean up elongated words such as ones with stretched letters.

        First deletes every run of three identical ASCII letters, then passes
        each remaining match of the "elongated" regex through
        ``handle_elongated_match`` (not defined in this class; presumably
        inherited from ``TextPreProcessor``).
        """
        # NOTE(review): the triple is removed entirely rather than collapsed
        # to one letter ("sooo" -> "s") - confirm this is intended.
        s = self.mini_elongated.sub('', sentence)
        return self.elongated.sub(self.handle_elongated_match, s)

    def handle_repeated_propername(self, sentence: str):
        """Delete every run of one or more back-to-back ``propername``
        occurrences and trim surrounding whitespace."""
        # re caches compiled patterns, so the pattern string is passed as-is.
        return re.sub(self.propername_regex, '', sentence).strip()

    def correct(self, sentence: str):
        """Spell-correct each whitespace-separated word.

        Words of 25 or more characters are not sent to the corrector; they
        are replaced by an empty string instead.
        """
        corrected = [
            self.spell.correct(w) if len(w) < 25 else ''
            for w in sentence.split()
        ]
        return " ".join(corrected)

    def segment(self, sentence: str):
        """Run the word segmenter over each whitespace-separated token.

        Tokens of 50 or more characters are not segmented; they are replaced
        by an empty string instead.
        """
        segmented = []
        for tok in sentence.split():
            if len(tok) < 50:
                segmented.extend(self.seg.segment(tok).split())
            else:
                segmented.append('')
        return " ".join(segmented)

    def strip_tags(self, sentence: list):
        """Join tokens into a string, dropping ``self.tags`` if enabled.

        ``sentence`` is normally the token list coming out of
        ``pre_process_doc``; a plain string is also accepted and is split on
        whitespace first. A string is always returned, regardless of
        ``remove_tags``, so the stages that follow (which call ``.split()``)
        work in both modes.
        """
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        if self.remove_tags:
            tokens = [w for w in tokens if w not in self.tags]
        return " ".join(tokens)

    def remove_stopwords(self, sentence: str):
        """Drop every word whose lowercase form is in ``self.stopwords``."""
        kept = [w for w in sentence.split() if w.lower() not in self.stopwords]
        return " ".join(kept)

    def strip_non_alphanumeric(self, sentence: str):
        """Drop every character that is not in ``string.printable``.

        Despite the name, punctuation and whitespace are kept; only
        characters outside ``string.printable`` are removed.
        """
        return "".join([c for c in sentence if c in string.printable])

    def strip_punctuation(self, sentence: str):
        '''
        Remove all punctuation from the given sentence,
        including emoticons like :)

        Each punctuation character becomes a space, then runs of whitespace
        are collapsed to single spaces.
        '''
        punct = string.punctuation
        s = sentence.translate(str.maketrans(punct, ' ' * len(punct)))
        return " ".join(s.split())

    def handle_long_strings(self, sentence: str):
        """Blank out a message that is one long unbroken token.

        Returns ``''`` when the message has no space, is longer than 50
        characters and contains no punctuation; otherwise returns the
        message unchanged.
        """
        if ' ' in sentence or len(sentence) <= 50:
            return sentence
        if any(c in string.punctuation for c in sentence):
            return sentence
        return ''

    def _clean_message(self, msg: str):
        """Run the full cleaning pipeline over a single message.

        The order of the stages matters and is kept as-is: lowercase, long
        string filter, non-printable filter, ekphrasis pre-processing, tag
        removal, stopword removal, punctuation removal, long-digit removal,
        elongation handling, ``propername`` removal, segmentation, spell
        correction and a final stopword pass on the corrected words.
        """
        m = msg.lower()
        m = self.handle_long_strings(m)
        m = self.strip_non_alphanumeric(m)
        # pre_process_doc is not defined in this class (presumably inherited);
        # strip_tags turns whatever it returns back into a string.
        m = self.pre_process_doc(m)
        m = self.strip_tags(m)
        m = self.remove_stopwords(m)
        m = self.strip_punctuation(m)
        m = self.handle_repeated_digits(m)
        m = self.handle_elongated(m)
        m = self.handle_repeated_propername(m)
        m = self.segment(m)
        m = self.correct(m)
        m = self.remove_stopwords(m)
        return m

    def normalize(self, split_dict):
        """Normalize every message of every student.

        Args:
            split_dict: mapping of student -> list of raw message strings.

        Returns:
            dict mapping each student to a single string made of that
            student's cleaned messages joined with ``" | "``.

        Progress is printed as ``done/total`` once every 20 students.
        """
        total = len(split_dict)
        normalized = {}
        for j, (student, messages) in enumerate(split_dict.items()):
            if j % 20 == 0:
                print(f"{j}/{total}")
            normalized[student] = " | ".join(
                self._clean_message(msg) for msg in messages
            )
        return normalized


In [0]:
import argparse
from ekphrasis.classes.tokenizer import SocialTokenizer, Tokenizer

# Parser with three boolean flags (--train, --dev, --test), each False unless
# passed.
# NOTE(review): no parse_args() call appears in this notebook, so these flags
# look unused here - confirm before relying on them.
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true')
parser.add_argument('--dev', action='store_true')
parser.add_argument('--test', action='store_true')


def get_normalizer():
    """Build the ``Normalizer`` with this notebook's fixed configuration.

    Returns:
        a new ``Normalizer`` that tokenizes with a lowercasing
        ``SocialTokenizer`` and has tag removal enabled.
    """
    options = {
        # Order and contents are kept exactly as configured.
        # NOTE(review): 'url' is listed twice - confirm whether intentional.
        'normalize': ['url', 'email', 'percent', 'money', 'phone', 'user',
                      'time', 'url', 'date', 'number'],
        # "elongated" is currently disabled.
        'annotate': {"repeated", 'emphasis', 'censored'},
        'remove_tags': True,
        'unpack_hashtags': False,
        'segmenter': 'english',
        'spell_correction': False,
        'spell_correct_elong': False,
        'tokenizer': SocialTokenizer(lowercase=True).tokenize,
    }
    return Normalizer(**options)

def main(filename, outfile):
    """Normalize one CSV of student messages and write the result.

    Args:
        filename: path of the input CSV, read with ``import_data``.
        outfile: path of the output CSV, written with
            ``export_normalized_data``.
    """
    raw_messages = import_data(filename)
    per_student_sentences = split_sentences(raw_messages)

    normalized = get_normalizer().normalize(per_student_sentences)

    export_normalized_data(raw_messages, outfile, normalized)


In [0]:
# Choose which chunk to process by pointing infile/outfile at one of the
# *_input / *_output paths from the setup cell, then run the whole pipeline.
outfile = sixth_output
infile = sixth_input
main(infile, outfile)