# Import libraries

In [1]:
import os
import json
import csv
from pathlib import Path
import re
import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *


Loading Training Data
Training data is in the following format, but as a CSV:
```json
{
  "review_id": "xQY8N_XvtGbearJ5X4QryQ",
  "user_id": "OwjRMXRC0KyPrIlcjaXeFQ",
  "business_id": "-MhfebM0QIsKt87iDN-FNw",
  "stars": 2,
  "useful": 5,
  "funny": 0,
  "cool": 0,
  "text": "As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House of Faberge from the Virginia Museum of Fine Arts (VMFA), I knew I had to go!.",
  "date": "2015-04-15 05:21:16"
}
```

In [None]:
"""
for dirname, _, filenames in os.walk('/content'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""

"\nfor dirname, _, filenames in os.walk('/content'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n"

# Install fastText
fastText is a library for efficient learning of word representations and sentence classification. Documentation can be found on https://fasttext.cc/docs/en/support.html.

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 13.8 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 18.4 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 21.4 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 23.6 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 26.3 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 27.1 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 6.2 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3123789 sha256=147f02f77d2ec3d09d69ad911ed2a94c5216acc9dd3fbb0debbf27b6d4a27f5f
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a

# Data Formatting
* Since we are using fastText, we need to convert the data into the format that the fastText library expects.
* fastText requires a text file with each piece of text on a separate line.
* We also need to add a special prefix at the start of each line, in the format `__label__YOURLABEL`. This prefix serves as the label for that piece of text.
* We will use the number of stars to label each review, so our final formatted data will look like this:
```
__label__5 This restaurant is great!
__label__1 This restaurant is terrible :'(
```

In [None]:
def transform_text(rating, text):
    """Build one fastText training line: the ``__label__<rating>`` prefix, a space, then the text."""
    line_template = "__label__{} {}"
    return line_template.format(rating, text)

transform_text('5','This restaurant is great!')

'__label__5 This restaurant is great!'

# Text Normalization
* For more details please refer [Introduction to NLP](https://www.kaggle.com/satishgunjal/introduction-to-nlp#Text-Normalization)
* Here we convert the text to lowercase and then add a space on both sides of each punctuation mark. (fastText knows nothing about language conventions, so without this step 'hey' and 'hey!' would be two different words to it.)

In [3]:
def string_formatting(string):
    """Normalize review text for fastText.

    The text is lower-cased, then every occurrence of one of the characters
    . ! ? , ' / ( ) is wrapped in a single space on each side, so punctuation
    ends up as its own whitespace-separated token.
    """
    lowered = string.lower()
    # \1 re-inserts the matched punctuation character between the two added spaces.
    return re.sub(r"([.!?,'/()])", r" \1 ", lowered)

string_formatting('This restaurant is great!')

'this restaurant is great ! '

# Load Stop Words

In [None]:
# The stop-word lists are a separately downloaded NLTK corpus, so fetch the corpus
# before reading it through nltk.corpus.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # NOTE(review): not referenced anywhere else in the visible notebook
# English stop words as a set; only needed by the optional (currently disabled)
# stop-word filtering step applied while the train/test files are built.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Create Training & Test Data
* We are going to create the training and test datasets from the available data. For more details, please refer to Train Test Split.
* Here we are going to create fasttext_dataset_train.txt for training the model and fasttext_dataset_test.txt for testing the model.

In [4]:
# Paths of the raw reviews CSV and of the fastText-formatted train / test files
reviews_data = Path('/content/drive/MyDrive/dataset/HomeworkData.csv')
fasttext_dataset_train = Path('/content/fasttext_dataset_train.txt') # Text file to store data in required format
fasttext_dataset_test = Path('/content/fasttext_dataset_test.txt') # Text file to store data in required format

# 10% data for testing
percent_test_data = 0.10

# A dedicated, seeded generator makes the train/test split (and therefore every metric
# computed from it) reproducible across kernel restarts without touching the global
# `random` state.
split_seed = 42
split_rng = random.Random(split_seed)

# Optional experiment: drop English stop words from every review.
# When switched on, the `stop_words` set from the "Load Stop Words" cell must exist.
remove_stop_words = False

# newline='' is what the csv module expects for its input file: it lets the reader
# handle line breaks inside quoted review text itself.
# The handle is deliberately not called `input`, which would shadow the builtin for the
# rest of the kernel session.
with reviews_data.open(newline='') as reviews_file, \
        fasttext_dataset_train.open("w") as train_output, \
        fasttext_dataset_test.open("w") as test_output:
    # DictReader takes its keys from the header row, so no separate reader is needed
    # to fetch the field names first.
    for row in csv.DictReader(reviews_file):
        rating = row['stars']
        # fastText expects exactly one example per line: flatten every kind of line
        # break inside the review into a single space.
        text = row['text'].replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        text = string_formatting(text)

        # Optional experiment (disabled): append the vote counts as extra tokens
        # text = "{} __funny{}__ __useful{}__ __cool{}__".format(text, row['funny'], row['useful'], row['cool'])

        if remove_stop_words:
            # Keep every surviving word followed by one space.
            text = "".join(word + " " for word in text.split() if word not in stop_words)

        fasttext_line = transform_text(rating, text)

        # random() returns a float in [0.0, 1.0); roughly `percent_test_data` of the
        # rows end up in the test file, the rest in the training file.
        if split_rng.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")

In [None]:
# Print the size in MB of both generated dataset files.
# The Path objects defined with the split are reused so the locations are stated only
# once; `.stem` is the file name without ".txt", which is the label printed before.
for dataset_path in (fasttext_dataset_train, fasttext_dataset_test):
    file_size = dataset_path.stat().st_size/1e+6
    print(f'{dataset_path.stem}, file size is: {file_size} MB \n')

fasttext_dataset_train, file size is: 5.295942 MB 

fasttext_dataset_test, file size is: 0.609682 MB 



# Train the Model
* We now have separate datasets for training and testing.
* Now let's train the model using the **fasttext_dataset_train.txt** file.

\* In order to train a text classifier, we can use fasttext.train_supervised function like this:

In [None]:
import fasttext

# Train on the exact file written by the split step. Passing the Path (as a string)
# instead of a bare file name keeps this cell independent of the current working
# directory.
# wordNgrams=1 -> unigram features only; epoch=30 -> 30 passes over the training data.
model = fasttext.train_supervised(str(fasttext_dataset_train), wordNgrams = 1, epoch = 30)

In [None]:
# Once the model is trained, we can retrieve the list of words and labels.
# The labels keep the full `__label__` prefix (e.g. `__label__5.0`), so any code that
# needs the numeric rating has to strip that prefix first.
print(model.words[:20]) # Printing first 20 words
print(model.labels)

['.', 'the', ',', 'and', 'i', 'a', 'to', "'", 'was', 'it', 'of', 'for', 'is', 'in', '!', 'my', 'that', '</s>', 'with', 'they']
['__label__5.0', '__label__4.0', '__label__1.0', '__label__3.0', '__label__2.0']


# Model Score
To evaluate our model by computing the precision at 1 (P@1) and the recall on a test set, we use the test function

In [None]:
def print_results(N, p, r, k=1):
    """Print the metrics returned by ``model.test`` as tab-separated rows.

    Args:
        N: number of examples that were evaluated.
        p: precision at ``k``.
        r: recall at ``k``.
        k: number of top predictions the metrics were computed for. Pass the same
            value that was given to ``model.test`` so the rows are labelled
            correctly. Defaults to 1, which reproduces the previous output.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

# Evaluate on the test file through its Path so the cell does not depend on the
# current working directory.
print_results(*model.test(str(fasttext_dataset_test)))

N	929
P@1	0.623
R@1	0.623


We can also check how often the correct star rating was among the model's top 2 predictions (e.g. the model's two most likely guesses were “5” and “4”, and the real user said “4”):

In [None]:
# print_results labels its rows "@1" when it is given only (N, p, r), which would
# mislabel these top-2 metrics, so they are printed here with the matching "@2" label.
top_k = 2
n_examples, precision_at_k, recall_at_k = model.test(str(fasttext_dataset_test), k=top_k)
print("N\t" + str(n_examples))
print("P@{}\t{:.3f}".format(top_k, precision_at_k))
print("R@{}\t{:.3f}".format(top_k, recall_at_k))

N	929
P@1	0.438
R@1	0.876


## MSE
This function computes MSE. MSE is used as the main metric for this project.

In [None]:
def _stars_from_label(label):
    """Return the integer star rating encoded in a fastText label.

    Works for ``__label__5.0`` as well as ``__label__5``; parsing the text after the
    prefix avoids relying on the label having a fixed width.
    """
    return int(float(label[len("__label__"):]))


def print_mse(_model, path):
    """Print the mean squared error of the model's star predictions on a fastText file.

    Args:
        _model: object whose ``predict(text)`` returns ``(labels, probabilities)``
            with the most likely label first (a trained fastText model).
        path: text file with one ``__label__<stars> <text>`` example per line.

    Prints the path, the number of evaluated examples, the hand-computed MSE and
    sklearn's ``mean_squared_error`` as a cross-check. Returns nothing.
    """
    print(path)
    y_pred = []
    y_test = []
    with open(path, "r") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            # predict() handles a single line of text, so the line terminator must go.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # The label is everything before the first space, the review everything after.
            label, _, text = line.partition(" ")
            predicted = _model.predict(string_formatting(text))[0][0]
            y_pred.append(_stars_from_label(predicted))
            y_test.append(_stars_from_label(label))

    count = len(y_test)
    print("count: ", count)
    if count == 0:
        # Nothing to average; dividing by zero would crash here.
        print("no labelled examples found, MSE is undefined")
        return

    mse = sum((pred - true) ** 2 for pred, true in zip(y_pred, y_test)) / count
    print("my MSE: ", mse)
    print("sklearn MSE: ", mean_squared_error(y_test, y_pred))
print_mse(model, str(fasttext_dataset_test))


/content/fasttext_dataset_test.txt
count:  929
my MSE:  0.7438105489773951
sklearn MSEL  0.7438105489773951


# Model Testing
* We can also test our model prediction for any new reviews.
* We have to use model.predict method and provide the review text as input.
* By default, predict returns only one label : the one with the highest probability. You can also predict more than one label by specifying the parameter k
* In order to increase the prediction accuracy, we have to format the input text just like the training data.