In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import string
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc,precision_recall_curve
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV

# Load the two CSV splits used throughout the notebook; both paths are
# resolved relative to the kernel's current working directory.
train = pd.read_csv(filepath_or_buffer="train5.csv")
test = pd.read_csv(filepath_or_buffer="test5.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avitr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Define the function for Task 1

def create_model(train_docs, train_y, test_docs, test_y, \
              model_type='svm', stop_words=None, min_df = 1, print_result = True, algorithm_para=1.0):
    """Fit a TF-IDF text classifier, score it on the test split and plot ROC / PR curves.

    Parameters
    ----------
    train_docs, test_docs :
        Collections of raw text documents. ``.apply`` is called on them, so they
        are expected to be pandas Series of strings.
    train_y, test_y :
        Class labels aligned with the documents. ``roc_curve`` is called without
        ``pos_label`` and column 1 of ``predict_proba`` is used as the score, so
        the labels are assumed to be binary (e.g. 0/1) -- TODO confirm.
    model_type : str
        ``'svm'`` builds a linear-kernel ``SVC``; any other value builds a
        ``MultinomialNB``.
    stop_words, min_df :
        Forwarded unchanged to ``TfidfVectorizer``.
    print_result : bool
        When true, print the ``classification_report`` for the test split.
        The AUC/PRC summary line and both plots are always produced.
    algorithm_para : float
        ``C`` for the SVM, ``alpha`` for Naive Bayes.

    Returns
    -------
    (auc_score, prc_score) : tuple of float
        Area under the ROC curve and area under the precision-recall curve,
        both computed on the test split.
    """
    # Strip ASCII punctuation and lower-case every document before vectorizing.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def normalize(text):
        return text.translate(punctuation_table).lower()

    train_docs = train_docs.apply(normalize)
    test_docs = test_docs.apply(normalize)

    # Vocabulary and IDF weights are learned from the training documents only.
    vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)
    # The matrices are kept sparse: both SVC and MultinomialNB accept sparse
    # input, and densifying a documents x vocabulary matrix wastes memory.

    # model building
    if model_type == 'svm':
        clf = svm.SVC(kernel='linear', probability=False, C=algorithm_para)
    else:
        clf = MultinomialNB(alpha=algorithm_para)

    clf.fit(train_features, train_y)
    y_pred = clf.predict(test_features)

    if print_result:
        print(classification_report(test_y, y_pred))

    # One continuous score per test document: the signed margin for the SVM
    # (probability=False, so predict_proba is unavailable), or the probability
    # of the second class for Naive Bayes.
    if model_type == 'svm':
        y_score = clf.decision_function(test_features)
    else:
        y_score = clf.predict_proba(test_features)[:, 1]

    fpr, tpr, _ = roc_curve(test_y, y_score)
    auc_score = auc(fpr, tpr)

    precision, recall, _ = precision_recall_curve(test_y, y_score)
    # Area under the precision-recall curve. Averaging the precision array
    # instead would weight every threshold equally and is not an area.
    prc_score = auc(recall, precision)

    print('AUC: {:.2f}% PRC: {:.2f}%'.format(auc_score*100, prc_score*100))

    plt.figure()
    lw = 2
    plt.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=lw,
        label="ROC curve (area = %0.2f)" % auc_score,
    )
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_type} AUC')
    plt.legend(loc="lower right")
    plt.show()

    plt.figure()
    plt.plot(
        recall,
        precision,
        marker='.',
        label="PR curve (area = %0.2f)" % prc_score,
    )
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_type} PRC')
    plt.legend(loc="lower left")
    plt.show()

    return auc_score, prc_score


In [None]:
# Grid search over the TF-IDF + SVM pipeline.
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        # Pipeline steps must be estimator *instances*; passing the SVC class
        # itself makes the `clf__C` parameter impossible to set.
        # NOTE(review): create_model uses kernel='linear'; this uses SVC's
        # default kernel -- confirm which one the search is meant to tune.
        ("clf", svm.SVC()),
    ]
)

# Keys follow the `<step name>__<parameter>` convention. The values in the
# trailing comments are candidates that are currently disabled.
param_grid = {
    'tfidf__stop_words': ['english'],  # None
    'tfidf__min_df': [1],  # 2, 5
    'clf__C': [0.2],  # 0.5, 0.8
}

grid_search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1, verbose=3)
# NOTE(review): only the first 10 rows are used, which looks like a smoke
# test -- widen the slice before trusting the selected parameters.
grid_search.fit(train["text"][:10], train["label"][:10])

# `best_params_` is an attribute holding the winning parameter combination;
# `best_estimator_` is also an attribute (a fitted Pipeline) and is not callable.
best_parameters = grid_search.best_params_
print(best_parameters)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [5]:
# A one-element set, built through the set() constructor.
a = set([2])

In [6]:
# Display the set defined in the previous cell.
a

{2}

In [7]:
# Sets are unordered and do not support indexing. Catch the resulting
# TypeError and print it, so the failure is still demonstrated but a
# "Restart & Run All" is not stopped by this cell.
try:
    a[0]
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'set' object is not subscriptable

In [24]:
# Sample 3-tuples used by the sorting cells below.
mylist = [
    (2, 4, 1),
    (2, 14, 5),
    (12, 3, 15),
    (1, 2, 4),
    (3, 4, 3),
    (4, 1, 2),
]

In [25]:
# Display the list in its original (unsorted) order.
mylist

[(2, 4, 1), (2, 14, 5), (12, 3, 15), (1, 2, 4), (3, 4, 3), (4, 1, 2)]

In [26]:
# With no key, tuples compare element by element: first field, then the
# second field to break ties, and so on. Returns a new list; mylist is unchanged.
sorted(mylist)

[(1, 2, 4), (2, 4, 1), (2, 14, 5), (3, 4, 3), (4, 1, 2), (12, 3, 15)]

In [17]:
# Order by the first field only; tuples that tie on it keep their original
# relative order because sorted() is stable.
sorted(mylist, key=lambda triple: triple[0])

[(1, 2, 4), (2, 4, 1), (2, 14, 5), (3, 4, 3), (4, 1, 2), (12, 3, 15)]

In [15]:
# Order by the second field only; tuples that tie on it keep their original
# relative order because sorted() is stable.
sorted(mylist, key=lambda triple: triple[1])

[(4, 1, 2), (1, 2, 4), (12, 3, 15), (2, 4, 1), (3, 4, 3), (2, 14, 5)]

In [18]:
# Order by the third field only; tuples that tie on it keep their original
# relative order because sorted() is stable.
sorted(mylist, key=lambda triple: triple[2])

[(2, 4, 1), (4, 1, 2), (3, 4, 3), (1, 2, 4), (2, 14, 5), (12, 3, 15)]