In [98]:
import numpy as np
from random import choice
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import math
from collections import Counter
from nnsplit import NNSplit
import random
import pickle

# Read Data

In [99]:
def dataNormalize(raw_path, new_article=None):

    with open(raw_path, "rb") as fp:
        train = pickle.load(fp)

    token_set = []
    for i in range(len(train)):
        word_tokens = tokenizer.tokenize(train[i]['content'])
        filtered_sentence = []
        for w in word_tokens:
            filtered_sentence.append(lemmatizer.lemmatize(w))
        token_set.append(filtered_sentence)

    if new_article != None:
        word_tokens = tokenizer.tokenize(new_article)
        filtered_sentence = []
        for w in word_tokens:
            filtered_sentence.append(lemmatizer.lemmatize(w))
        token_set.append(filtered_sentence)

    return token_set

In [100]:
def genTfidf(token_set, header=None):
    
    IDF = {}
    showed = {}
    for i in range(len(token_set)):
        showed = {}
        for word in token_set[i]:
            if word not in IDF:
                IDF[word] = 1
            else:
                if word not in showed:
                    showed[word] = 1
                    IDF[word] += 1
                else:
                    continue
    for key in IDF.keys():
        IDF[key] = math.log(len(token_set)/IDF[key], 10)
    IDF = {k: v for k, v in sorted(IDF.items(), key=lambda item: item[1], reverse=True)}
    
    tfidf_arr = []
    for i in range(len(token_set)):
        TF = Counter(token_set[i])
        TFIDF = {}
        for key in TF.keys():
            try:
                TFIDF[key] = TF[key] * IDF[key]
            except:
                print(key)
                
        TFIDF = sorted(TFIDF.items(), key=lambda item: item[1], reverse=True)
        tfidf_arr.append(TFIDF)
    
    return tfidf_arr

# N-gram

In [101]:
def ngrams(arr, n):
    
    grams = []
    for i in range(len(arr)):
        if i < len(arr) - n + 1:
            temp = tuple([arr[i+j] for j in range(n)])
            grams.append(temp)
            
    return grams

In [102]:
def nGramLM(data, n):
    
    nGramDict = {}
    for i in range(len(data)):
        trainData = ngrams(data[i], n)
        for j in range(len(trainData)):
            termTuple = trainData[j]
            if termTuple not in nGramDict.keys():
                nGramDict[termTuple] = 1
            else:
                nGramDict[termTuple] += 1
    
    return nGramDict

In [103]:
def chooseBigramWord(cfd, tfidf, key):
    
    candidateKey = []
    for i in cfd.keys():
        if i[0] == key:
            candidateKey.append(i)
            
    pList = []
    for i in candidateKey:
        pList.append(cfd[i]*tfidf[i[1]])
    
    res = random.choices(population=candidateKey, weights=pList, k=1)
    
    return res[0][1]

In [104]:
def chooseTrigramWord(cfd, tfidf, firstKey, secondKey):
    
    candidateKey = []
    for i in cfd.keys():
        if i[0] == firstKey and i[1] == secondKey:
            candidateKey.append(i)
            
    pList = []
    for i in candidateKey:
        pList.append(cfd[i]*tfidf[i[2]])
    
    assert(len(pList) != 0)
    
    res = random.choices(population=candidateKey, weights=pList, k=1)
    
    return res[0][2]

In [105]:
def generateArticle(cfd, tfidf, word, n, num=10):
    
    arr = []
    arr.append(word)
    
    if n == 2:
        for i in range(num):
            newWord = chooseBigramWord(cfd, tfidf, word)
            arr.append(newWord)
            word = newWord
    
    elif n == 3:
        newWord = chooseBigramWord(cfd, tfidf, word)
        arr.append(newWord)
        firstWord = word
        secondWord = newWord
        for i in range(num-1):
            newWord = chooseTrigramWord(cfd, tfidf, firstWord, secondWord)
            arr.append(newWord)
            firstWord = secondWord
            secondWord = newWord
            
    else:
        print("N-gram Error")
            
    return arr[:num]

In [106]:
def project(data, tfidf, artNum, n, senLen):
    
    tfidf = dict((x, y) for x, y in tfidf[artNum])
    max_value = max(tfidf.values())
    max_keys = [k for k, v in tfidf.items() if v == max_value]
    lm = nGramLM([data[artNum]], n)
    art = generateArticle(lm, tfidf, max_keys[0], n, senLen)
    
    return art

In [107]:
def splitSentence(arr):
    
    text = " ".join(arr)
    splitter = NNSplit.load("en")
    splits = splitter.split([text])[0]
        
    return splits[0]

In [127]:
def addFarmWord(title):
    
    farm_word = ["I was stunned after reading it!!! ", "The whole audience collapsed!!! ", "I can't believe my eyes!!!", 
                "I watched it three times and still can't believe it!!! ", "You must see to the end!!! ", "My jaw dropped!!! ",
                "This is not true, is it!!! ", "Let the experts fall through the glasses!!! ", "I jumped up from the chair!!! ",
                "Unexpected news!!! ", "This is really incredible!!! ", "News that has never been revealed!!! ",
                "Frightened everyone!!! ","I guarantee you have never seen it!!! ", "Unbelievable fact!!! "]

    res = random.choice(farm_word) + title
    res = res[:-1] + "!"
    
    return res

# Main

In [128]:
def catchWord(path, text):
    
    token_set = dataNormalize(path, text)
    tfidf = genTfidf(token_set)
    n = 3
    senLen = 20
    res = project(token_set, tfidf, len(token_set)-1, n, senLen)
    sentence = splitSentence(res)
    
    print(addFarmWord(str(sentence)))

In [129]:
if __name__ == '__main__':
    
    rawPath = "./SPORTS_Raw.pkl"
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    text = "The Houston Rockets are moving on from franchise superstar James Harden. They have traded the 31-year-old to the Brooklyn Nets as part of a three-team deal, the Nets announced on Thursday. In return for Harden, Houston is acquiring Caris LeVert and Rodions Kurucs from the Nets, Dante Exum from the Cleveland Cavaliers, three first-round picks from the Nets, one first-round pick from the Cavaliers via the Milwaukee Bucks, and four first-round pick swaps from the Nets. In a separate deal, Houston is trading LeVert and a second-round pick to the Indiana Pacers for guard and two-time All-Star Victor Oladipo, according to The Athletic's Shams Charania. Harden, an eight-time All-Star, was acquired by the Rockets from the Oklahoma City Thunder in 2012. While in Houston, he was voted the league's best player for the 2017-18 season and led the Rockets to the playoffs in all eight years. The postgame comments were the last of a string a of negative behavior from the disgruntled star, after arriving late to the team's training camp, and then being sidelined for four days and fined $50,000 by the NBA for violating the league's health and safety protocols days before the start of the season. The former MVP now reunites with former Thunder teammate Kevin Durant and perennial All-Star guard Kyrie Irving in Brooklyn."
    catchWord(rawPath, text)

The whole audience collapsed!!! Nets In a separate deal Houston is trading LeVert and Rodions Kurucs from the Cavaliers via the Milwaukee Bucks an!
