In [1]:
from model import NLPModel
import pandas as pd
from sklearn.model_selection import train_test_split

'''
Create the model object
The NLP model object uses a Naive Bayes classifier and a TFIDF vectorizer:
self.clf = MultinomialNB()
self.vectorizer = TfidfVectorizer()
'''

def build_model():
    model = NLPModel()
    with open ('data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')
    
    # Use only the 1 star and 5 star reviews
    # For this example, we want to only predict positive or negative sentiment using the extreme cases.
    pos_neg = data[(data['Sentiment']==0) | (data['Sentiment']==4)]
    
    ## Relabel as 0 for negative and 1 for positive¶
    pos_neg['Binary'] = pos_neg.apply(
        lambda x: 0 if x['Sentiment'] == 0 else 1, axis=1)
    
    #Fit a vectorizer to the vocabulary in the dataset
    #pos_neg.loc[:, 'Phrase']
        
    X = model.vectorizer_fit_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer fit transform complete')
    
    y = pos_neg.loc[:, 'Binary']
    
    # split X and y into training and testing sets
    # by default, it splits 75% training and 25% test
    # random_state=1 for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    #print(X_train.shape)
    #print(X_test.shape)
    #print(y_train.shape)
    #print(y_test.shape)
    model.train(X_train, y_train)
    print('Model training complete')
        
    model.pickle_clf()
    model.pickle_vectorizer()

        
if __name__ == "__main__":
    build_model()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Vectorizer fit transform complete
Model training complete
Pickled classifier at lib/models/SentimentClassifier.pkl
Pickled vectorizer at lib/models/TFIDFVectorizer.pkl
