- Documentation (filtered stream quick start): https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/quick-start
- Rule syntax (how to build a rule): https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/integrate/build-a-rule

In [1]:
import warnings
# Suppress every Python warning emitted from here on.
warnings.filterwarnings('ignore')

In [None]:
import requests
import os
import json
import re
import tweepy as tw
# ML
import pandas as pd
from joblib import dump, load
import nltk
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords 
# Download the NLTK stopwords corpus into the local "stopwords" directory and
# add that directory to NLTK's data search path so it can be found afterwards.
nltk.download("stopwords", "stopwords")
nltk.data.path.append('stopwords/')
# Same for WordNet, downloaded into the local "wordnet" directory
# (presumably needed by the WordNetLemmatizer used in lemmatize()).
nltk.download("wordnet", "wordnet")
nltk.data.path.append('wordnet/')
# English stopwords as a set; not referenced by the functions visible below.
stop_words = set(stopwords.words('english'))

# Documentation (filtered stream quick start): https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/quick-start
# Rule syntax (how to build a rule): https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/integrate/build-a-rule

# ML Algo
def remove_URL(x):
    """Return *x* with every ``http...`` run (up to the next whitespace) removed."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", x)

def tokenize(x):
    """Lower-case *x* and split it into tokens with NLTK's TweetTokenizer."""
    lowered = x.lower()
    return TweetTokenizer().tokenize(lowered)

def tokenize_remove_regex(x):
    """Re-tokenize a token sequence, splitting on punctuation and noise words.

    The elements of *x* are stringified and joined with single spaces; the
    joined text is then split with a RegexpTokenizer whose pattern describes
    the separators (``gaps=True``), not the tokens.
    """
    joined_text = ' '.join(map(str, x))
    splitter = RegexpTokenizer(
        r'http|2019|2018|cve|2020| |\.|,|:|;|!|\?|\(|\)|\||\+|\'|"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<|\/|\[|\]',
        gaps=True,
    )
    return splitter.tokenize(joined_text)

def stemmer(x):
    """Porter-stem every word in *x* and return the stems joined by spaces."""
    porter = PorterStemmer()
    stemmed_words = map(porter.stem, x)
    return ' '.join(stemmed_words)
 
def lemmatize(x):
    """Lemmatize every word in *x* with WordNet and join the results by spaces."""
    wordnet = WordNetLemmatizer()
    lemmas = (wordnet.lemmatize(token) for token in x)
    return ' '.join(lemmas)


# TW API
def create_headers(bearer_token):
    """Build the request headers dict carrying *bearer_token* as Bearer auth."""
    return {"Authorization": f"Bearer {bearer_token}"}


def get_rules(headers, bearer_token):
    """Fetch the filtered-stream rules currently registered on the API.

    Args:
        headers: HTTP headers sent with the request (see create_headers).
        bearer_token: Unused; kept so existing callers keep working.

    Returns:
        The decoded JSON body of the rules endpoint response.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the server does not answer within the timeout.
    """
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        # Without a timeout requests can block forever on a stalled server.
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    # Decode the body once and reuse it for both logging and the return value.
    rules = response.json()
    print(json.dumps(rules))
    return rules


def delete_all_rules(headers, bearer_token, rules):
    """Delete every rule listed in *rules* from the filtered stream.

    Args:
        headers: HTTP headers sent with the request (see create_headers).
        bearer_token: Unused; kept so existing callers keep working.
        rules: Decoded rules response as returned by get_rules, or None.

    Returns:
        None. Nothing is sent when *rules* is empty or lists no rules.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the server does not answer within the timeout.
    """
    # Skip the request when there is nothing to delete; this also covers an
    # empty "data" list, which would otherwise send a delete with no ids.
    if not rules or not rules.get("data"):
        return None

    ids = [rule["id"] for rule in rules["data"]]
    payload = {"delete": {"ids": ids}}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload,
        # Without a timeout requests can block forever on a stalled server.
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    print(json.dumps(response.json()))


def set_rules(headers, delete, bearer_token):
    """Register the CVE filter rule on the filtered stream.

    Args:
        headers: HTTP headers sent with the request (see create_headers).
        delete: Unused; kept so existing callers keep working.
        bearer_token: Unused; kept so existing callers keep working.

    Returns:
        None. The decoded API response is printed.

    Raises:
        Exception: If the endpoint answers with a status other than 201.
        requests.exceptions.RequestException: On connection errors or when
            the server does not answer within the timeout.
    """
    # You can adjust the rules if needed
    sample_rules = [
        {'value': '"CVE-" -is:retweet -is:reply -from:gDogLiveStream -from:vigilance_fr -from:threatintelctr -from:vigilance_en -from:www_sesin_at -from:WolfgangSesin -from:CVEreport -from:LinInfoSec',
         'tag': 'Exploits'},
    ]
    payload = {"add": sample_rules}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload,
        # Without a timeout requests can block forever on a stalled server.
        timeout=30,
    )
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))


def _twitter_client_from_env():
    """Build an authenticated tweepy API client from environment variables."""
    names = (
        "TWITTER_APP_KEY",
        "TWITTER_APP_SECRET",
        "TWITTER_ACCESS_TOKEN",
        "TWITTER_ACCESS_TOKEN_SECRET",
    )
    missing = [name for name in names if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing Twitter credentials in environment: {}".format(", ".join(missing))
        )
    app_key, app_secret, access_token, access_token_secret = (
        os.environ[name] for name in names
    )
    auth = tw.OAuthHandler(app_key, app_secret)
    auth.set_access_token(access_token, access_token_secret)
    return tw.API(auth)


def _stems_for_tweet(text):
    """Strip URLs from *text*, tokenize and stem it; return a one-row Series."""
    tokens = tokenize_remove_regex(tokenize(remove_URL(text)))
    return pd.Series([stemmer(tokens)], name='stems')


def _handle_stream_line(response_line, cve_pattern, model, twapi):
    """Classify one stream message and retweet it when the model predicts 1."""
    json_response = json.loads(response_line)
    tweet = json_response.get('data')
    if not tweet or 'text' not in tweet:
        # Not a tweet payload; show it instead of failing on a missing key.
        print(json.dumps(json_response))
        return

    text = tweet['text']
    if not cve_pattern.search(text):
        return

    print(text)
    print(tweet['id'])
    result = model.predict(_stems_for_tweet(text))
    print("Result:", result)
    print(json.dumps(json_response, indent=4, sort_keys=True))
    # predict() is called with a single row, so the first label is the only one.
    if result[0] == 1:
        twapi.retweet(tweet['id'])


def get_stream(headers, set, bearer_token):
    """Consume the filtered stream and retweet tweets classified as exploits.

    Every streamed tweet whose text contains a CVE identifier is run through
    the saved model; tweets predicted as class 1 are retweeted.

    The OAuth credentials used for retweeting are read from the environment
    variables TWITTER_APP_KEY, TWITTER_APP_SECRET, TWITTER_ACCESS_TOKEN and
    TWITTER_ACCESS_TOKEN_SECRET rather than being stored in the source.

    Args:
        headers: HTTP headers sent with the stream request (see create_headers).
        set: Unused; kept so existing callers keep working.
        bearer_token: Unused; kept so existing callers keep working.

    Raises:
        RuntimeError: If any of the credential environment variables is unset.
        Exception: If the stream endpoint answers with a status other than 200.
    """
    prog = re.compile(r'CVE-\d{4}-\d{4,7}', re.IGNORECASE)
    # Build the client and load the model once, not once per tweet, and fail
    # fast before a stream connection is opened.
    twapi = _twitter_client_from_env()
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    gs_mnnb = load('TW_exploit_detect_NLP_Modelv1.joblib')

    with requests.get(
        "https://api.twitter.com/2/tweets/search/stream", headers=headers, stream=True,
    ) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )
        try:
            for response_line in response.iter_lines():
                if not response_line:
                    # Blank lines carry no payload.
                    continue
                try:
                    _handle_stream_line(response_line, prog, gs_mnnb, twapi)
                except Exception as err:
                    # One bad message must not end the stream; report and go on.
                    print("Skipping stream message after error: {!r}".format(err))
        except KeyboardInterrupt:
            print("Stream interrupted by user.")
        except requests.exceptions.RequestException as err:
            print("Stream connection lost: {!r}".format(err))


def main():
    """Reset the stream rules and start consuming the filtered stream.

    The bearer token is read from the TWITTER_BEARER_TOKEN environment
    variable rather than being stored in the source.

    Raises:
        RuntimeError: If TWITTER_BEARER_TOKEN is unset or empty.
    """
    bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
    if not bearer_token:
        raise RuntimeError(
            "Set the TWITTER_BEARER_TOKEN environment variable before running."
        )
    headers = create_headers(bearer_token)
    rules = get_rules(headers, bearer_token)
    delete = delete_all_rules(headers, bearer_token, rules)
    # Named rules_set rather than set so the builtin is not shadowed.
    rules_set = set_rules(headers, delete, bearer_token)
    get_stream(headers, rules_set, bearer_token)


# Entry point: call main() when this module is run directly, not when imported.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to stopwords...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to wordnet...
[nltk_data]   Package wordnet is already up-to-date!


{"data": [{"id": "1377599583598092288", "value": "\"CVE-\" -is:retweet -is:reply -from:gDogLiveStream -from:vigilance_fr -from:threatintelctr -from:vigilance_en -from:www_sesin_at -from:WolfgangSesin -from:CVEreport -from:LinInfoSec", "tag": "Exploits"}], "meta": {"sent": "2021-04-01T12:50:46.072Z"}}
{"meta": {"sent": "2021-04-01T12:50:47.598Z", "summary": {"deleted": 1, "not_deleted": 0}}}
{"data": [{"value": "\"CVE-\" -is:retweet -is:reply -from:gDogLiveStream -from:vigilance_fr -from:threatintelctr -from:vigilance_en -from:www_sesin_at -from:WolfgangSesin -from:CVEreport -from:LinInfoSec", "tag": "Exploits", "id": "1377604356711796736"}], "meta": {"sent": "2021-04-01T12:50:49.251Z", "summary": {"created": 1, "not_created": 0, "valid": 1, "invalid": 0}}}
200
Hi, I'm CVE-2021-29939.  I was never good with numbers though, so you can call me Zealous Dove
https://t.co/Y3lXu67qvW
1377604720072794115
Result: [0]
{
    "data": {
        "id": "1377604720072794115",
        "text": "Hi, I'm 

CVE-2021-1699 Windows (modem.sys)信息泄漏漏洞
https://t.co/hsM7249gsx
1377618799877324814
Result: [1]
{
    "data": {
        "id": "1377618799877324814",
        "text": "CVE-2021-1699 Windows (modem.sys)\u4fe1\u606f\u6cc4\u6f0f\u6f0f\u6d1e\nhttps://t.co/hsM7249gsx"
    },
    "matching_rules": [
        {
            "id": 1377604356711796736,
            "tag": "Exploits"
        }
    ]
}
