In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
from time import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train.shape
test.shape

(1306122, 3)

(56370, 2)

In [3]:
test['target'] = -1
df = pd.concat([train, test], axis=0)
df.reset_index(drop=True, inplace=True)

df.shape

(1362492, 3)

In [4]:
df['upper_num'] = df.question_text.apply(lambda x: len([i for i in x if i.isupper()]))
df['symbol_num'] = df.question_text.apply(lambda x: len([i for i in x if i in '?!.,;[]{}:<>()@#$%^&*/+-']))

In [5]:
import gensim
import nltk
import re

In [6]:
stopwords = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]
len(stopwords)

174

In [7]:
# 去除停用词、符号
def sent2words(sentence):
    words = []
    for w in wordNormal(sentence):
        if w not in stopwords and w != '' and re.compile(r'[1-9]\d*\.\d*|0\.\d*[1-9]|[1-9]\d*').findall(w) == []:
            words.append(w)
    return words

In [8]:
# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return None

def wordNormal(sentence):
    sentence = re.sub("[+\.\!\/_,$%^*(+\"\'\°]+|[—():~?]+{}“”&’：；;\[\]‘", '', sentence)
    sentence = re.sub("-", ' ', sentence)
    tokens = nltk.word_tokenize(sentence)
    tagged_sent = nltk.pos_tag(tokens)
    
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas_sent = []
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or nltk.corpus.wordnet.NOUN
        lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    return lemmas_sent

In [9]:
%%time
df.question_text = df.question_text.apply(lambda x: sent2words(x))
df.question_text = df.question_text.apply(lambda x: [i.lower() for i in x])

Wall time: 2h 58min 41s


In [10]:
country = ["Afghanistan","Anguilla","Armenia","Argentina","Aruba","Australia","Austria","Azerbaijan","Bahamas","Bahrain","Bangladesh","Barbados","Belarus","Belgium","Belize","Benin","Bermuda","Bhutan","Bolivia","Bouvet Islands","Brazil","British Indian Ocean Territory","British Virgin Islands","Brunei","Bulgaria","Burkina Faso","Burundi","Cambodia","Cameroon","Canada","Cape Verde","Cayman Islands","Central African Republic","Chad","Chile","China","Colombia","Comoros","Congo","Costa Rica","Cote D'Ivorie","Croatia","Cyprus","Czech Republic","Denmark","Djibouti","Dominica","Dominican Republic","Egypt","El Salvador","Equador","Equatorial Guinea","Eritrea","Estonia","Ethiopia","Falkland Islands","Faroe Islands","Federated States of Micronesia","Fiji","Finland","France","French Guiana","French Polynesia","Gabon","Gambia","Georgia","Germany","Ghana","Gibraltar","Greece","Greenland","Grenada","Guadeloupe","Guam","Guatemala","Guinea","Guinea- Bissau","Guyana","Haiti","Honduras","Hong Kong","Hungary","Iceland","India","Indonesia","Republic of Ireland","Israel","Italy","Jamaica","Japan","Jordan","Kazakhstan","Kenya","Kiribati","Kuwait","Kyrgyzstan","Laos","Latvia","Lebanon","Lesotho","Liberia","Liechtenstein","Lithuania","Luxembourg","Macau","Madagascar","Malawi","Malaysia","Maldives","Mali","Malta","Marshall Islands","Martinique","Mauritania","Mayotte","Metropolitan France","Mexico","Moldova","Mongolia","Morocco","Mozambique","Namibia","Nauru","Nepal","Neterlands Antilles","Netherlands","New Caledonia","New Zealand","Nicaragua","Niger","Nigeria","Northern Mariana Islands","Norway","Oman","Pakistan","Palau","Panama","Papua New Guinea","Paraguay","Peru","Philippines","Pitcairn","Poland","Portugal","Puerto Rico","Qatar","Republic of Korea","Republic of Macedonia","Reunion","Romania","Russia","Sao Tome and Principe","Saudi Arabia","Senegal","Seychelles","Singapore","Slovakia","Slovenia","Solomon Islands","Somalia","South Africa","Spain","Sri Lanka","St. Helena","St. Kitts and Nevis","St. Lucia","St. Vincent and the Grenadines","Sudan","Suriname","Svalbard and Jan Mayen Islands","Swaziland","Sweden","Switzerland","Syria","Taiwan","Tajikistan","Tanzania","Thailand","Togo","Tonga","Trinidad and Tobago","Turkey","Turkmenistan","Turks and Caicos Islands","Tuvalu","Uganda","Ukraine","United Arab Emirates","United Kingdom","United States USA","Uruguay","Uzbekistan","Vanuatu","Vatican City","Venezuela","Vietnam","Western Sahara","Yemen","Yugoslavia","Zaire","Zambia","Zimbabwe","North Korea"]
country_short = ["AF","AI","AM","AR","AW","AU","AT","AZ","BS","BH","BD","BB","BY","BE","BZ","BJ","BM","BT","BO","BV","BR","IO","VI","BN","BG","BF","BI","KH","CM","CA","CV","KY","CF","TD","CL","CN","CO","KM","CG","CR","CI","HR","CY","CZ","DK","DJ","DM","DO","EG","SV","EC","GQ","ER","EE","ET","FK","FO","FM","FJ","FI","FR","GF","PF","GA","GM","GE","DE","GH","GI","GR","GL","GD","GP","GU","GT","GN","GW","GY","HT","HN","HK","HU","IS","IN","ID","IE","IL","IT","JM","JP","JO","KZ","KE","KI","KW","KG","LA","LV","LB","LS","LR","LI","LT","LU","MO","MG","MW","MY","MV","ML","MT","MH","MQ","MR","YT","FX","MX","MD","MN","MA","MZ","NA","NR","NP","AN","NL","NC","NZ","NI","NE","NG","MP","NO","OM","PK","PW","PA","PG","PY","PE","PH","PN","PL","PT","PR","QA","KR","MK","RE","RO","RU","ST","SA","SN","SC","SG","SK","SI","SB","SO","ZA","ES","LK","SH","KN","LC","VC","SD","SR","SJ","SZ","SE","CH","SY","TW","TJ","TZ","TH","TG","TO","TT","TR","TM","TC","TV","UG","UA","AE","GB","US","UY","UZ","VU","VA","VE","VN","EH","YE","YU","ZR","ZM","ZW"]
religion = ["Ahmadiyya","Aladura","Amish","Anglicanism","Asatru","Assemblies of God","Atheism","Baha'i Faith","Baptists","Bon","Buddhism","Candomble","Cao Dai","Cathari","Catholicism","Charismatic movement","Chinese Religion","Christadelphians","Christian Science","Christianity","Church of God","Church of God in Christ","Church of Satan","Confucianism","Conservative Judaism","Deism","Donatism","Dragon Rouge","Druze","Eastern Orthodox Church","Eckankar","ELCA","Epicureanism","Evangelicalism","Falun Gong","Foursquare Church","Gnosticism","Greek Religion","Hare Krishna","Hasidism","Hellenic Reconstructionism","Hinduism","Illuminati","Intelligent Design","Islam","Jainism","Jehovah's Witnesses","Judaism","Kabbalah","Kemetic Reconstructionism","Lutheranism","Mahayana Buddhism","Mayan Religion","Methodism","Mithraism","Mormonism","Neopaganism","New Age","New Thought","Nichiren","Norse Religion","Olmec Religion","Oneness Pentecostalism","Orthodox Judaism","Pentecostalism","Presbyterianism","Priory of Sion","Protestantism","Pure Land Buddhism","Quakers","Rastafarianism","Reform Judaism","Rinzai Zen Buddhism","Roman Religion","Satanism","Scientology","Seventh-Day Adventism","Shaivism","Shi'a Islam","Shinto","Sikhism","Soto Zen Buddhism","Spiritualism","Stoicism","Sufism","Sunni Islam","Taoism","Tendai Buddhism","Theravada Buddhism","Tibetan Buddhism","Typhonian Order","Umbanda","Unification Church","Unitarian Universalism","Vaishnavism","Vajrayana Buddhism","Vedanta","Vineyard Churches","Voodoo","Westboro Baptist Church","Wicca","Worldwide Church of God","Yezidi","Zen","Zionism","Zoroastrianism"]

country = [i.lower() for i in country]
country_short = [i.lower() for i in country_short]
religion = [i.lower() for i in religion]

In [14]:
%%time
df['has_country_num'] = df.question_text.apply(lambda x: len(set(country) & set(x)))
df['has_country_short_num'] = df.question_text.apply(lambda x: len(set(country_short) & set(x)))
df['has_religion_num'] = df.question_text.apply(lambda x: len(set(religion) & set(x)))

Wall time: 15.7 s


## 异常问题

In [20]:
from collections import defaultdict

In [17]:
df.target.value_counts()
df[df.target == 1].head()

 0    1225312
 1      80810
-1      56370
Name: target, dtype: int64

Unnamed: 0,qid,question_text,target,upper_num,symbol_num,has_country_num,has_country_short_num,has_religion_num
22,0000e91571b60c2fb487,"[has, united, states, become, large, dictators...",1,3,1,0,0,0
30,00013ceca3f624b09f42,"[which, baby, sweet, parent, dark, skin, baby,...",1,2,2,0,0,0
110,0004a7fcb2bf73076489,"[if, black, support, school, choice, mandatory...",1,2,1,0,0,0
114,00052793eaa287aff1e1,"[i, gay, boy, i, love, cousin, boy, he, sexy, ...",1,8,10,0,0,0
115,000537213b01fd77b58a,"[which, race, small, penis]",1,1,1,0,0,0


In [21]:
except_word_dict = defaultdict(lambda : 0)

for i in tqdm_notebook(df[df.target == 1].question_text):
    for word in i:
        except_word_dict[word] += 1

A Jupyter Widget




In [32]:
word_dict = defaultdict(lambda : 0)

for i in tqdm_notebook(df.question_text):
    for word in i:
        word_dict[word] += 1

A Jupyter Widget




In [34]:
except_word_dict_rate = defaultdict(lambda : 0)

for i in tqdm_notebook(except_word_dict.keys()):
    except_word_dict_rate[i] = except_word_dict[i] / word_dict[i]

A Jupyter Widget




In [54]:
except_word = ['woman','trump','men','white','muslims','black','girl','india','indian','americans','us','sex','indians','liberal','chinese','muslim','american','president','child','gay','america','donald','old','jews','man','racist','god','government','china','obama','christians','democrats','hindus','state','religion','human','stupid','islam','usa','modi','pakistan','israel','conservative','anti','male','atheist','terrorist','money','rape','female','hillary','british','asian','russia','united','jewish','african','sexual','feminist','western','hindu','immigrant','fuck','clinton','europe','christian','kid','penis','uk','arent','japanese','force','republicans','russian','trumps','europeans','bjp','religious','political','hitler','european','west','asians','democrat','castrate','pakistani','republican','jesus','minority','enough','democratic','slave','dumb','palestinians','korea','islamic','abuse','suck','liberals','homosexual','tamil','pakistanis','russians','shit','criminal','victim','holocaust','canada','iran','transgender','sexually','terrorism','leftist','africans','africa','foreign','christianity','israeli','korean','homosexuality','jew','masturbate','japan','kashmir','quorans','gandhi','syria','nazis','australia','germans','french','girls','drug','britain','fbi','german','brahmins','eu','muhammad','vagina','canadians','lgbt','barack','turkish','koreans','pussy','england','brainwash','mexico','quran','asshole','hinduism','vietnamese','ive','mexicans','sexist','palestinian','christ','france','sweden','narcissist','obamas','karnataka','harry','bangladesh','naked','punishment','california','iranians','bengali','rahul','mexican','union','kashmiri','pedophile','jerusalem','harass','delhi','nationalist','arabia','nepal','iraq','australians','gods','australian','prostitute','italians','canadian','bengal','dalits','catholics','tamilians','xi','melania','jinping','cnn']
except_word_rate1 = [i[0] for i in sorted(except_word_dict_rate.items(), key=lambda x:x[1], reverse=True) if (i[1] == 1) and (word_dict[i[0]] > 1)]
except_word_rate2 = [i[0] for i in sorted(except_word_dict_rate.items(), key=lambda x:x[1], reverse=True) if (i[1] == 1) and (word_dict[i[0]] > 2)]
len(except_word_rate1)
len(except_word_rate2)

594

152

In [57]:
%%time
# df['has_except_word_num'] = df.question_text.apply(lambda x: len(set(except_word) & set(x)))
df['has_except_word_rate1_num'] = df.question_text.apply(lambda x: len(set(except_word_rate1) & set(x)))
df['has_except_word_rate2_num'] = df.question_text.apply(lambda x: len(set(except_word_rate2) & set(x)))

Wall time: 26.1 s


In [47]:
df[(df.has_except_word_rate_num != 0) & (df.target == 1)].shape
df[(df.has_except_word_rate_num == 0) & (df.target == 1)].shape

(6912, 10)

(73898, 10)

In [58]:
df.head()

Unnamed: 0,qid,question_text,target,upper_num,symbol_num,has_country_num,has_country_short_num,has_religion_num,has_except_word_num,has_except_word_rate1_num,has_except_word_rate2_num
0,00002165364db923c7e6,"[how, quebec, nationalist, see, province, nation]",0,2,1,0,0,0,1,0,0
1,000032939017120e6e44,"[do, adopt, dog, encourage, people, adopt, shop]",0,1,2,0,1,0,0,0,0
2,0000412ca6e4628ce2cf,"[why, velocity, affect, time, does, velocity, ...",0,2,2,0,0,0,0,0,0
3,000042bf85aa498cd78e,"[how, otto, von, guericke, use, magdeburg, hem...",0,4,1,0,0,0,0,0,0
4,0000455dfa3e01eae3af,"[can, i, convert, montra, helicon, d, mountain...",0,3,1,0,0,0,0,0,0
