In [1]:
import pandas as pd
import re
from nltk import FreqDist
import glob
from nltk.stem.wordnet import WordNetLemmatizer
import enchant
import nltk
from pattern.text.en import singularize
import spacy

In [2]:
df = pd.read_csv('out/all_final.txt', sep=' : ', engine='python')

In [3]:
#get frequency of frequency 
f = df["frequency"].value_counts()
f.to_csv('frequency_number.txt')

In [6]:
acc = len(df["frequency"]) - f.cumsum(axis = 0)
acc
#acc.to_csv('frequency_number_acc.txt')

1       19907
2       17052
3       15193
4       13842
5       12784
        ...  
409         4
425         3
1474        2
4555        1
1983        0
Name: frequency, Length: 1123, dtype: int64

In [9]:
words = df["word"]
words.to_csv('words.txt', index=False)



In [17]:
d = "called"
print(singularize(d))
tokenized = nltk.word_tokenize(d)
nltk.pos_tag(tokenized)

called


[('called', 'VBN')]

In [16]:

# get word type
nlp = spacy.load("en_core_web_sm")
text = ("call")
doc = nlp(text)
verbs = [(token, token.tag_) for token in doc]
verbs

[(call, 'VB')]

In [3]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("ridden", 'v')

'ride'

In [42]:
dicts = enchant.Dict("en_US")
dicts.check("bodiless")

True

In [2]:
def isWordValide(word):
    return (not re.search(r"\d+", word)) and len(word) > 1

def single(word, dic):
    w = singularize(word)
    if word != w and dic.check(w):
        #print(f'{word} => {w}')
        return w
    return word

def origin(word, dic, lemmatizer):
    w = lemmatizer.lemmatize(word, 'v')
    if word != w and dic.check(w):
        #print(f'{word} => {w}')
        return w
    return single(word, dic)

def getFormatText(file):
    data = open(file).read().lower()
    data = re.sub(r'[^\x20-\x7F]+', '', data)
    data = re.sub(r'-\n', '', data)
    data = re.sub(r'\n+', ' ', data)
    data = re.sub(r'-', ' ', data)
    data = re.sub(r'\d+', '', data)
    data = re.sub(r'[?./!;,:*-+{}\[\]\\@#$%^&()_`~‘’]|“|”', '', data)
    data = re.sub(r' +', ' ', data)
    #data = re.sub(r'\W\d+\W', '', data)
    return data

def getWords(file):
    data = getFormatText(file)
    return [item.strip() for item in data.split(' ') if item.strip()]

def getWordsFrequencies(files):
    #nltk.download('punkt')
    word_dist = FreqDist()
    dicts = enchant.Dict("en_US")
    for f in files: 
        words = getWords(f)
        for item in words:
            if dicts.check(item) and isWordValide(item):
                word_dist.update([item])
    result = dict(word_dist)
    #result = dict(sorted(result.items(), key=lambda item: item[1]))
    return result

def getOrigins(wordFreq):
    lemmatizer = WordNetLemmatizer()
    dic = enchant.Dict("en_US")

    res = {}
    for item in wordFreq.keys():
        try:
            o = origin(item, dic, lemmatizer)
            if o not in res:
                res[o] = wordFreq[item]
            else:
                res[o] += wordFreq[item]
        except:
            print(item)
        
    res = dict(sorted(res.items(), key=lambda item: item[1]))
    return res




In [3]:
#get the word frequency
files = glob.glob("texts/sub/*")
i = 0
for file in files:
    i += 1
    wordFreq = getWordsFrequencies([file])
    wordFreq = dict(sorted(wordFreq.items(), key=lambda item: item[1]))

    out_file = open(f'out/subs/s_{i}.txt', 'w')
    out_file.write(f'{file}\n')
    out_file.write('word : frequency\n')
    for item in wordFreq.keys():
        out_file.write(f'{item} : {wordFreq[item]}\n')
    print(f'{file} --- > Done')


1
texts/sub\Halt and Catch Fire.S01E01.I O.srt --- > Done
2
texts/sub\Halt and Catch Fire.S01E02.FUD.srt --- > Done
3
texts/sub\Halt and Catch Fire.S01E03.High Plains Hardware.srt --- > Done
4
texts/sub\Halt and Catch Fire.S01E04.Close to the Metal.srt --- > Done
5
texts/sub\Halt and Catch Fire.S01E05.Adventure.srt --- > Done
6
texts/sub\Halt and Catch Fire.S01E06.Landfall.srt --- > Done
7
texts/sub\Halt and Catch Fire.S01E07.Giant.srt --- > Done
8
texts/sub\Halt and Catch Fire.S01E08.The 214s.srt --- > Done
9
texts/sub\Halt and Catch Fire.S01E09.Up Helly Aa.srt --- > Done
10
texts/sub\Halt and Catch Fire.S01E10.1984.srt --- > Done
11
texts/sub\Mr.Robot.S03E01.720p.BluRay.x264-DEMAND.srt --- > Done
12
texts/sub\Mr.Robot.S03E02.720p.BluRay.x264-DEMAND.srt --- > Done
13
texts/sub\Mr.Robot.S03E03.720p.BluRay.x264-DEMAND.srt --- > Done
14
texts/sub\Mr.Robot.S03E04.720p.BluRay.x264-DEMAND.srt --- > Done
15
texts/sub\Mr.Robot.S03E05.720p.BluRay.x264-DEMAND.srt --- > Done
16
texts/sub\Mr.Robo

In [3]:
files = glob.glob("out/final/*")
allWords = {}
for f in files:
    df = pd.read_csv(f, sep=' : ', engine='python', skiprows=[])
    for index, row in df.iterrows():
        word = row['word']
        frequency = row['frequency']
        if word not in allWords:
            allWords[word] = frequency
        else:
            allWords[word] += frequency
    print(f'{f}-{len(df["word"])} --> Done')
len(allWords)


out/final\final_words.txt-23106 --> Done
out/final\final_words_subs.txt-12573 --> Done


25284

In [5]:
#write the All words
allWords = dict(sorted(allWords.items(), key=lambda item: item[1]))
out_file = open('out/all_final.txt', 'w')
out_file.write('word : frequency\n')
for item in allWords.keys():
    out_file.write(f'{item} : {allWords[item]}\n')
len(allWords)



25284

In [16]:
#origin words

final_words = pd.read_csv('out/all_words_subs.txt', sep=' : ', engine='python')
out_file = open('out/final_words_subs.txt', 'w')
out_file.write('word : frequency\n')

test = {}
for index, row in final_words.iterrows():
    word = row['word']
    frequency = row['frequency']
    test[word] = frequency
print(len(final_words['word']))
print(len(test))

final_words = getOrigins(test)
print(len(final_words))

out_file.write('word : frequency\n')
for item in final_words.keys():
    out_file.write(f'{item} : {final_words[item]}\n')


17708
17708
nan
12573
