## Data preparation for user interface

Computing TF-IDF is time-consuming for such a large data set, and the user interface needs to be both fast and accurate. We therefore pre-process and pre-calculate the data it needs and store the results in the user interface folder. "weights_dict_bytime" and "doc_dict" are the two objects that our web app finally uses.

### Convert Data to Fasttext Model format

In [None]:
import fasttext
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import pickle

In [None]:
# Load the saved fastText model from "model.bin" (relative to the working
# directory); it is used below to predict a label for every review.
model = fasttext.load_model("model.bin")

In [None]:
def FormatConvert(df):
    """Build one "summary: reviewText" string per review for the fastText model.

    Missing summaries are replaced by a single space. As in the original
    implementation, that replacement is written back into ``df['summary']``,
    so the caller's frame is updated too.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``summary`` and ``reviewText``.
        ``reviewText`` is expected to hold no missing values (``readJson``
        already drops such rows).

    Returns
    -------
    list of str
        One line per row, with newline characters removed.
    """
    # Operate on the argument itself, not on a global ``data`` frame.
    df['summary'] = df['summary'].fillna(' ')
    combined = df['summary'] + ': ' + df['reviewText']
    # Newlines are stripped so that every review is a single line of text.
    return [line.replace('\n', '') for line in combined]

def readJson(filePath):
    """Load review records from an open JSON file into a DataFrame.

    ``filePath`` is an already opened file object (it is handed straight to
    ``json.load``). The ``reviewTime`` column is parsed back into datetimes
    and rows without ``reviewText`` are dropped.
    """
    frame = pd.DataFrame(json.load(filePath))
    # Dates are stored as strings in the JSON files; restore real datetimes.
    frame["reviewTime"] = pd.to_datetime(frame.reviewTime)
    has_text = frame['reviewText'].notna()
    return frame[has_text]

def subsetData(data, pct):
    """Return a random ``pct`` fraction of ``data`` when it has more than 500000 rows.

    Frames at or below that size are returned unchanged.
    """
    exceeds_limit = data.shape[0] > 500000
    return data.sample(frac=pct) if exceeds_limit else data

def saveJson(data, file_name):
    """Write ``data`` to ``file_name`` as a JSON list of records.

    ``reviewTime`` is serialised as a ``YYYY-MM-DD`` string so that it can be
    stored in JSON. The conversion is applied to a derived frame, so the
    caller's ``data`` keeps its datetime column and can be saved or filtered
    by date again afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``reviewTime`` column.
    file_name : str
        Destination path of the JSON file.
    """
    as_text = data["reviewTime"].dt.strftime('%Y-%m-%d')
    # assign() returns a new frame; the input is left untouched.
    data.assign(reviewTime=as_text).to_json(file_name, orient='records')

def textExtract_str(data):
    """Join every non-missing ``reviewText`` of ``data`` into one space-separated string."""
    reviews = data['reviewText'].dropna()
    return ' '.join(reviews.tolist())

def removePuncLower(s):
    """Replace each run of non-word characters in ``s`` by one space and lower-case the result."""
    return re.sub(r'[^\w]+', ' ', s).lower()

def computeTFIDF(corpus, columnnames, dfmax):
    """Compute a term-by-document TF-IDF table.

    Parameters
    ----------
    corpus : list of str
        One text per document.
    columnnames : list
        Column labels of the result, one per document, in corpus order.
    dfmax : float or int
        Passed to ``TfidfVectorizer`` as ``max_df``.

    Returns
    -------
    pandas.DataFrame
        Rows are the vocabulary terms, columns are ``columnnames`` and the
        values are the TF-IDF weights.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_df=dfmax, use_idf=True)
    print("transforming...")
    X = vectorizer.fit_transform(corpus)
    print("getting feature name...")
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    print("converting to data frame...")
    # Transpose while still sparse and densify once, instead of building
    # nested Python lists and transposing them with zip().
    return pd.DataFrame(X.T.toarray(), index=feature_names, columns=columnnames)

def save_obj(obj, path, name):
    """Pickle ``obj`` to the file ``path + name + '.pkl'`` with the highest protocol."""
    target = path + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(obj)

In [None]:
filePath = 'data'
# Collect the base names (without ".json") of the review files in filePath.
# Only real .json entries are kept: blindly cutting the last five characters
# would also mangle sub-directories such as "subsets_data" or "tfidf".
# Sorting makes the order, and therefore the TF-IDF column order, reproducible.
filelist = [
    os.path.splitext(entry)[0]
    for entry in sorted(os.listdir(filePath))
    if entry.endswith('.json')
]

### Add label to all the data

In [None]:
# Label every review file with the fastText prediction, store the labelled
# subset under data/subsets_data/ and delete the original file.
subset_dir = 'data/subsets_data/'
os.makedirs(subset_dir, exist_ok=True)  # to_json does not create directories

for file_stem in filelist:
    source_path = 'data/' + file_stem + '.json'
    with open(source_path) as f:
        print("reading " + file_stem + '...')
        data = readJson(f)
    # The file is closed here, so it can be removed safely further down
    # (removing a file that is still open fails on some platforms).
    print("subseting " + file_stem + '...')
    # copy() detaches the subset from the frame it was filtered from, so the
    # column added below is written to an independent frame.
    data = subsetData(data, 0.1).copy()
    print("processing " + file_stem + '...')
    text = FormatConvert(data)
    ####### modeling #######
    # Use the predict function 0: Neg, 1: Pos
    pred = model.predict(text)
    pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]
    data['pred_labels'] = pred_labels
    newpath = subset_dir + file_stem + '.json'
    print("saving " + newpath)
    saveJson(data, newpath)
    os.remove(source_path)

### Save TF-IDF as a dictionary for each department

In [None]:
# Build weights_dict[period][file][label] -> {term: tf-idf weight} holding the
# highest-weighted terms of every file, for each look-back period and for the
# positive, negative and complete review texts.
time_period = [24, 12, 6, 3]  # multiplied by 30 days below
senti_label = ['pos', 'neg', 'all']
top_k = 25  # number of terms kept per file and label

# The labelling step writes the files containing 'pred_labels' to this folder.
# NOTE(review): the original built the path as filePath + name, which lacks a
# separator and points at the unlabelled folder - confirm this directory.
labelled_dir = 'data/subsets_data/'

weights_dict = defaultdict(dict)
for t in time_period:
    pos_list = []
    neg_list = []
    doc_list = []
    weights_dict[t] = defaultdict(dict)
    print("processing period %s" % (t))
    for file_stem in filelist:
        weights_dict[t][file_stem] = defaultdict(dict)
        with open(labelled_dir + file_stem + '.json') as f:
            print("reading " + file_stem + '...')
            data = readJson(f)
        # Keep only the reviews of the last 30 * t days, counted back from
        # the newest review in this file.
        cutoff = data.reviewTime.max() - timedelta(days=30 * t)
        data = data[data.reviewTime > cutoff]
        pos_list.append(removePuncLower(textExtract_str(data[data['pred_labels'] == 1])))
        neg_list.append(removePuncLower(textExtract_str(data[data['pred_labels'] == 0])))
        doc_list.append(removePuncLower(textExtract_str(data)))

    # One document per file, so the columns of each table are the file names.
    tfidf_by_label = {
        'pos': computeTFIDF(pos_list, filelist, 0.99),
        'neg': computeTFIDF(neg_list, filelist, 0.99),
        'all': computeTFIDF(doc_list, filelist, 0.99),
    }

    for file_stem in filelist:
        for label, tfidf_df in tfidf_by_label.items():
            # nlargest() returns at most top_k rows, so a vocabulary with
            # fewer terms no longer raises an IndexError.
            weights_dict[t][file_stem][label] = tfidf_df[file_stem].nlargest(top_k).to_dict()

In [None]:
# Persist the per-period TF-IDF weights as data/tfidf/weights_dict_bytime.pkl.
save_obj(weights_dict, path='data/tfidf/', name='weights_dict_bytime')

### TF-IDF for an uploaded file
Using the most recent 3 months of data.

In [None]:
# Collect, per file, the cleaned review text of the most recent 90 days,
# split into positive, negative and all reviews. Each list in doc_dict holds
# one document per entry of filelist, in the same order.
doc_dict = defaultdict(list)

# The labelling step writes the files containing 'pred_labels' to this folder.
# NOTE(review): the original built the path as filePath + name, which lacks a
# separator and points at the unlabelled folder - confirm this directory.
labelled_dir = 'data/subsets_data/'

for file_stem in filelist:
    with open(labelled_dir + file_stem + '.json') as f:
        print("reading " + file_stem + '...')
        data = readJson(f)
    # Keep only the last 30 * 3 days, counted back from the newest review.
    cutoff = data.reviewTime.max() - timedelta(days=30 * 3)
    data = data[data.reviewTime > cutoff]
    doc_dict['pos'].append(removePuncLower(textExtract_str(data[data['pred_labels'] == 1])))
    doc_dict['neg'].append(removePuncLower(textExtract_str(data[data['pred_labels'] == 0])))
    doc_dict['all'].append(removePuncLower(textExtract_str(data)))

In [None]:
# Persist the per-file documents as data/tfidf/doc_dict.pkl. The original
# call pickled weights_dict under this name, so doc_dict was never saved.
save_obj(doc_dict, 'data/tfidf/', 'doc_dict')