## Build Your First Text Classifier in Python with Logistic Regression

使用LR解决文本多分类问题。

In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
# Load the news category dataset; lines=True tells pandas to parse the file
# as one JSON object per line.
df = pd.read_json('news_category_dataset.json', lines=True)
# Number of rows loaded.
len(df)

124989

In [13]:
def tokenize_url(url):
    """Turn an article URL into a space-separated string of word tokens.

    Args:
        url: URL string. Every occurrence of the
            ``https://www.huffingtonpost.com/entry/`` prefix is removed.

    Returns:
        The remaining text with each run of non-word characters and/or
        underscores collapsed into a single space. Leading or trailing
        separators therefore become a leading or trailing space.
    """
    url = url.replace('https://www.huffingtonpost.com/entry/', '')
    # Raw string so that \W reaches the regex engine instead of being treated
    # as an (invalid) string escape. [\W_] matches the same characters as
    # (\W|_): anything that is not a letter or digit.
    return re.sub(r'[\W_]+', ' ', url)


def extract_features(df, field, training_data, testing_data, type="binary"):
    """Vectorize one text field of the train and test sets.

    The vectorizer is fitted on the training data only and then applied to
    the test data, so the test set never influences the vocabulary.

    Args:
        df: Unused; kept so existing callers do not break.
        field: Name of the column holding the text to vectorize.
        training_data: Object supporting ``training_data[field].values``
            (e.g. a pandas DataFrame) with the training texts.
        testing_data: Same as ``training_data``, for the test texts.
        type: Feature representation. If it contains ``"binary"`` a binary
            bag-of-words is used; otherwise, if it contains ``"counts"``,
            raw term counts are used; any other value selects TF-IDF.
            (The name shadows the builtin but is part of the public
            signature.)

    Returns:
        A ``(train_feature_set, test_feature_set, vectorizer)`` tuple, where
        ``vectorizer`` is the fitted CountVectorizer or TfidfVectorizer.
    """
    # Terms appearing in more than 95% of documents are dropped (max_df=0.95)
    # in every representation.
    if "binary" in type:
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform is equivalent to fit followed by transform, so its result
    # is used directly rather than vectorizing the training data twice.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer
    
    
def get_top_k_predictions(model, X_test, k):
    """Return the k most probable class labels for every row of ``X_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature matrix passed straight to ``model.predict_proba``.
        k: Number of labels to return per row. Values larger than the number
            of classes return all classes; ``k <= 0`` returns empty lists.

    Returns:
        A list with one inner list per row, holding labels from
        ``model.classes_`` ordered from most to least probable.
    """
    # Probabilities, not hard labels, are needed to rank several candidates.
    probs = model.predict_proba(X_test)

    # Guard: the slice [:, -0:] would select *every* column rather than none.
    if k <= 0:
        return [[] for _ in range(len(probs))]

    # argsort is ascending, so the last k columns are the k best class
    # indices; reverse them to get descending probability.
    top_k_indices = np.argsort(probs, axis=1)[:, -k:][:, ::-1]

    # Map class indices back to their labels.
    return [[model.classes_[idx] for idx in row] for row in top_k_indices]

In [6]:
# Derive the candidate text fields, each one extending the previous:
# description -> description + headline -> description + headline + URL tokens.
df['tokenized_url'] = df['link'].apply(tokenize_url)
df['text_desc'] = df['short_description']
df['text_desc_headline'] = df['text_desc'] + ' ' + df['headline']
df['text_desc_headline_url'] = df['text_desc_headline'] + ' ' + df['tokenized_url']

In [8]:
# Split the frame into train and test partitions; the fixed seed keeps the
# split reproducible between runs.
train_data, test_data = train_test_split(df, random_state=2000)

# Target labels for each partition, as arrays.
Y_train, Y_test = (part['category'].values for part in (train_data, test_data))

In [9]:
# Experiment configuration. These values are passed to extract_features
# below, so changing them here is enough to switch field or representation.
field = 'text_desc_headline'
feature_rep = 'binary'
top_k = 3  # not used in this cell; presumably the k for get_top_k_predictions

X_train, X_test, feature_transformer = extract_features(
    df, field, train_data, test_data, type=feature_rep
)

In [10]:
# L2-penalised logistic regression using the liblinear solver.
scikit_lr = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# fit() returns the estimator itself, so `model` is the same object as
# `scikit_lr`.
scikit_lr.fit(X_train, Y_train)
model = scikit_lr



[LibLinear]